{ "cells": [ { "cell_type": "markdown", "metadata": { "cell_id": 40 }, "source": [ "### 加载数据集" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "cell_id": 1 }, "outputs": [], "source": [ "from utils import load_corpus, stopwords\n", "\n", "TRAIN_PATH = \"./data/weibo2018/train.txt\"\n", "TEST_PATH = \"./data/weibo2018/test.txt\"" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "cell_id": 2 }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Building prefix dict from the default dictionary ...\n", "Loading model from cache /tmp/jieba.cache\n", "Loading model cost 0.826 seconds.\n", "Prefix dict has been built successfully.\n" ] } ], "source": [ "# 分别加载训练集和测试集\n", "train_data = load_corpus(TRAIN_PATH)\n", "test_data = load_corpus(TEST_PATH)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "cell_id": 3 }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
textlabel
0书中 自有 黄金屋 书中 自有 颜如玉 沿着 岁月 的 长河 跋涉 或是 风光旖旎 或是 姹...1
1这是 英超 被 黑 的 最惨 的 一次 二哈 二哈 十几年来 中国 只有 孙继海 董方卓 郑...0
2中国 远洋 海运 集团 副总经理 俞曾 港 月 日 在 上 表示 中央 企业 走 出去 是 ...1
3看 流星花园 其实 也 还好 啦 现在 的 观念 以及 时尚 眼光 都 不一样 了 或许 十...1
4汉武帝 的 罪己 诏 的 真实性 尽管 存在 着 争议 然而 轮台 罪己 诏 作为 中国 历...1
\n", "
" ], "text/plain": [ " text label\n", "0 书中 自有 黄金屋 书中 自有 颜如玉 沿着 岁月 的 长河 跋涉 或是 风光旖旎 或是 姹... 1\n", "1 这是 英超 被 黑 的 最惨 的 一次 二哈 二哈 十几年来 中国 只有 孙继海 董方卓 郑... 0\n", "2 中国 远洋 海运 集团 副总经理 俞曾 港 月 日 在 上 表示 中央 企业 走 出去 是 ... 1\n", "3 看 流星花园 其实 也 还好 啦 现在 的 观念 以及 时尚 眼光 都 不一样 了 或许 十... 1\n", "4 汉武帝 的 罪己 诏 的 真实性 尽管 存在 着 争议 然而 轮台 罪己 诏 作为 中国 历... 1" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import pandas as pd\n", "\n", "df_train = pd.DataFrame(train_data, columns=[\"text\", \"label\"])\n", "df_test = pd.DataFrame(test_data, columns=[\"text\", \"label\"])\n", "df_train.head()" ] }, { "cell_type": "markdown", "metadata": { "cell_id": 42 }, "source": [ "### 训练词向量" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "cell_id": 44 }, "outputs": [ { "data": { "text/plain": [ "0 [书中, 自有, 黄金屋, 书中, 自有, 颜如玉, 沿着, 岁月, 的, 长河, 跋涉, ...\n", "1 [这是, 英超, 被, 黑, 的, 最惨, 的, 一次, 二哈, 二哈, 十几年来, 中国,...\n", "2 [中国, 远洋, 海运, 集团, 副总经理, 俞曾, 港, 月, 日, 在, 上, 表示, ...\n", "3 [看, 流星花园, 其实, 也, 还好, 啦, 现在, 的, 观念, 以及, 时尚, 眼光,...\n", "4 [汉武帝, 的, 罪己, 诏, 的, 真实性, 尽管, 存在, 着, 争议, 然而, 轮台,...\n", "Name: text, dtype: object" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# word2vec要求的输入格式: list(word)\n", "wv_input = df_train['text'].map(lambda s: s.split(\" \")) # [for w in s.split(\" \") if w not in stopwords]\n", "wv_input.head() " ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "cell_id": 4 }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/tiger/.local/lib/python3.7/site-packages/gensim/similarities/__init__.py:15: UserWarning: The gensim.similarities.levenshtein submodule is disabled, because the optional Levenshtein package is unavailable. Install Levenhstein (e.g. `pip install python-Levenshtein`) to suppress this warning.\n", " warnings.warn(msg)\n" ] } ], "source": [ "from gensim import models\n", "\n", "# Word2Vec\n", "word2vec = models.Word2Vec(wv_input, \n", " vector_size=64, # 词向量维度\n", " min_count=1, # 最小词频, 因为数据量较小, 这里卡1\n", " epochs=1000) # 迭代轮次" ] }, { "cell_type": "markdown", "metadata": { "cell_id": 46 }, "source": [ "查找近义词, 直观感受训练得到的word2vec效果" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "cell_id": 5, "scrolled": false }, "outputs": [ { "data": { "text/plain": [ "[('我', 0.9441561102867126),\n", " ('自己', 0.8928312659263611),\n", " ('他', 0.8796129822731018),\n", " ('的', 0.8601957559585571),\n", " ('她', 0.855070948600769),\n", " ('人', 0.8349815607070923),\n", " ('都', 0.8168802261352539),\n", " ('了', 0.8017680644989014),\n", " ('就', 0.7990766763687134),\n", " ('也', 0.7883183360099792)]" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "word2vec.wv.most_similar(\"你\")" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "cell_id": 38 }, "outputs": [ { "data": { "text/plain": [ "[('哈哈哈', 0.6309624910354614),\n", " ('啦', 0.5457888841629028),\n", " ('可爱', 0.5375339984893799),\n", " ('了', 0.4885959327220917),\n", " ('本柔', 0.46517741680145264),\n", " ('笑', 0.4639575779438019),\n", " ('哈哈哈哈', 0.45851588249206543),\n", " ('心虚', 0.4576280415058136),\n", " ('又', 0.45520466566085815),\n", " ('呀', 0.4494859576225281)]" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "word2vec.wv.most_similar(\"哈哈\")" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "cell_id": 39 }, "outputs": [ { "data": { "text/plain": [ "[('难过', 0.724579393863678),\n", " ('哭', 0.6421604752540588),\n", " ('想', 0.6415957808494568),\n", " ('也', 0.6394745707511902),\n", " ('真的', 0.6263709664344788),\n", " ('我', 0.6136066317558289),\n", " ('都', 0.608888566493988),\n", " ('的', 0.6078368425369263),\n", " ('就', 0.5916700959205627),\n", " ('开心', 0.5899774432182312)]" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "word2vec.wv.most_similar(\"伤心\")" ] }, { "cell_type": "markdown", "metadata": { "cell_id": 48 }, "source": [ "### 神经网络" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "cell_id": 14 }, "outputs": [], "source": [ "import torch\n", "from torch import nn\n", "from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence,pad_packed_sequence\n", "from torch.utils.data import Dataset, DataLoader\n", "\n", "device = \"cuda:0\" if torch.cuda.is_available() else \"cpu\"" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "cell_id": 19 }, "outputs": [], "source": [ "# 超参数\n", "learning_rate = 5e-4\n", "input_size = 768\n", "num_epoches = 5\n", "batch_size = 100\n", "embed_size = 64\n", "hidden_size = 64\n", "num_layers = 2" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "cell_id": 7 }, "outputs": [], "source": [ "# 数据集\n", "class MyDataset(Dataset):\n", " def __init__(self, df):\n", " self.data = []\n", " self.label = df[\"label\"].tolist()\n", " for s in df[\"text\"].tolist():\n", " vectors = []\n", " for w in s.split(\" \"):\n", " if w in word2vec.wv.key_to_index:\n", " vectors.append(word2vec.wv[w]) # 将每个词替换为对应的词向量\n", " vectors = torch.Tensor(vectors)\n", " self.data.append(vectors)\n", " \n", " def __getitem__(self, index):\n", " data = self.data[index]\n", " label = self.label[index]\n", " return data, label\n", "\n", " def __len__(self):\n", " return len(self.label)\n", "\n", "def collate_fn(data):\n", " \"\"\"\n", " :param data: 第0维:data,第1维:label\n", " :return: 序列化的data、记录实际长度的序列、以及label列表\n", " \"\"\"\n", " data.sort(key=lambda x: len(x[0]), reverse=True) # pack_padded_sequence要求要按照序列的长度倒序排列\n", " data_length = [len(sq[0]) for sq in data]\n", " x = [i[0] for i in data]\n", " y = [i[1] for i in data]\n", " data = pad_sequence(x, batch_first=True, padding_value=0) # 用RNN处理变长序列的必要操作\n", " return data, torch.tensor(y, dtype=torch.float32), data_length\n", "\n", "\n", "# 训练集\n", "train_data = MyDataset(df_train)\n", "train_loader = DataLoader(train_data, batch_size=batch_size, collate_fn=collate_fn, shuffle=True)\n", "\n", "# 测试集\n", "test_data = MyDataset(df_test)\n", "test_loader = DataLoader(test_data, batch_size=batch_size, collate_fn=collate_fn, shuffle=True)" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "cell_id": 11 }, "outputs": [], "source": [ "# 网络结构\n", "class LSTM(nn.Module):\n", " def __init__(self, input_size, hidden_size, num_layers):\n", " super(LSTM, self).__init__()\n", " self.hidden_size = hidden_size\n", " self.num_layers = num_layers\n", " self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, bidirectional=True)\n", " self.fc = nn.Linear(hidden_size * 2, 1) # 双向, 输出维度要*2\n", " self.sigmoid = nn.Sigmoid()\n", "\n", " def forward(self, x, lengths):\n", " h0 = torch.zeros(self.num_layers * 2, x.size(0), self.hidden_size).to(device) # 双向, 第一个维度要*2\n", " c0 = torch.zeros(self.num_layers * 2, x.size(0), self.hidden_size).to(device)\n", " \n", " packed_input = torch.nn.utils.rnn.pack_padded_sequence(input=x, lengths=lengths, batch_first=True)\n", " packed_out, (h_n, h_c) = self.lstm(packed_input, (h0, c0))\n", "\n", " lstm_out = torch.cat([h_n[-2], h_n[-1]], 1) # 双向, 所以要将最后两维拼接, 得到的就是最后一个time step的输出\n", " out = self.fc(lstm_out)\n", " out = self.sigmoid(out)\n", " return out\n", "\n", "lstm = LSTM(embed_size, hidden_size, num_layers)" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "cell_id": 26 }, "outputs": [], "source": [ "from sklearn import metrics\n", "\n", "# 在测试集效果检验\n", "def test():\n", " y_pred, y_true = [], []\n", "\n", " with torch.no_grad():\n", " for x, labels, lengths in test_loader:\n", " x = x.to(device)\n", " outputs = lstm(x, lengths) # 前向传播\n", " outputs = outputs.view(-1) # 将输出展平\n", " y_pred.append(outputs)\n", " y_true.append(labels)\n", "\n", " y_prob = torch.cat(y_pred)\n", " y_true = torch.cat(y_true)\n", " y_pred = y_prob.clone()\n", " y_pred[y_pred > 0.5] = 1\n", " y_pred[y_pred <= 0.5] = 0\n", " \n", " print(metrics.classification_report(y_true, y_pred))\n", " print(\"准确率:\", metrics.accuracy_score(y_true, y_pred))\n", " print(\"AUC:\", metrics.roc_auc_score(y_true, y_prob) )" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "cell_id": 32 }, "outputs": [], "source": [ "# 定义损失函数和优化器\n", "criterion = nn.BCELoss()\n", "optimizer = torch.optim.Adam(lstm.parameters(), lr=learning_rate)" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "cell_id": 33, "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "epoch:1, step:10, loss:0.689099133014679\n", "epoch:1, step:20, loss:0.6717442870140076\n", "epoch:1, step:30, loss:0.650161862373352\n", "epoch:1, step:40, loss:0.5935518741607666\n", "epoch:1, step:50, loss:0.4994719922542572\n", "epoch:1, step:60, loss:0.4774974286556244\n", "epoch:1, step:70, loss:0.482360303401947\n", "epoch:1, step:80, loss:0.44858306646347046\n", "epoch:1, step:90, loss:0.4513603746891022\n", "epoch:1, step:100, loss:0.4386572241783142\n", " precision recall f1-score support\n", "\n", " 0.0 0.75 0.80 0.78 155\n", " 1.0 0.91 0.88 0.89 345\n", "\n", " accuracy 0.86 500\n", " macro avg 0.83 0.84 0.83 500\n", "weighted avg 0.86 0.86 0.86 500\n", "\n", "准确率: 0.856\n", "AUC: 0.9141841982234689\n", "saved model: ./model/lstm_1.model\n", "epoch:2, step:10, loss:0.4317778944969177\n", "epoch:2, step:20, loss:0.41387200355529785\n", "epoch:2, step:30, loss:0.4237545430660248\n", "epoch:2, step:40, loss:0.364933043718338\n", "epoch:2, step:50, loss:0.37595903873443604\n", "epoch:2, step:60, loss:0.4067295491695404\n", "epoch:2, step:70, loss:0.41071224212646484\n", "epoch:2, step:80, loss:0.39134103059768677\n", "epoch:2, step:90, loss:0.37907883524894714\n", "epoch:2, step:100, loss:0.4322803020477295\n", " precision recall f1-score support\n", "\n", " 0.0 0.80 0.63 0.71 155\n", " 1.0 0.85 0.93 0.89 345\n", "\n", " accuracy 0.84 500\n", " macro avg 0.83 0.78 0.80 500\n", "weighted avg 0.83 0.84 0.83 500\n", "\n", "准确率: 0.838\n", "AUC: 0.9174193548387096\n", "saved model: ./model/lstm_2.model\n", "epoch:3, step:10, loss:0.37696003913879395\n", "epoch:3, step:20, loss:0.36385685205459595\n", "epoch:3, step:30, loss:0.3907310664653778\n", "epoch:3, step:40, loss:0.35576874017715454\n", "epoch:3, step:50, loss:0.36152324080467224\n", "epoch:3, step:60, loss:0.3620041608810425\n", "epoch:3, step:70, loss:0.32647013664245605\n", "epoch:3, step:80, loss:0.38903307914733887\n", "epoch:3, step:90, loss:0.34238141775131226\n", "epoch:3, step:100, loss:0.3952549397945404\n", " precision recall f1-score support\n", "\n", " 0.0 0.75 0.79 0.77 155\n", " 1.0 0.90 0.88 0.89 345\n", "\n", " accuracy 0.85 500\n", " macro avg 0.83 0.84 0.83 500\n", "weighted avg 0.86 0.85 0.85 500\n", "\n", "准确率: 0.854\n", "AUC: 0.9280411407199626\n", "saved model: ./model/lstm_3.model\n", "epoch:4, step:10, loss:0.34902292490005493\n", "epoch:4, step:20, loss:0.3277026116847992\n", "epoch:4, step:30, loss:0.32119297981262207\n", "epoch:4, step:40, loss:0.34501412510871887\n", "epoch:4, step:50, loss:0.3202686905860901\n", "epoch:4, step:60, loss:0.3599391579627991\n", "epoch:4, step:70, loss:0.2958642542362213\n", "epoch:4, step:80, loss:0.3152882158756256\n", "epoch:4, step:90, loss:0.3151417374610901\n", "epoch:4, step:100, loss:0.3314781188964844\n", " precision recall f1-score support\n", "\n", " 0.0 0.78 0.81 0.79 155\n", " 1.0 0.91 0.90 0.90 345\n", "\n", " accuracy 0.87 500\n", " macro avg 0.84 0.85 0.85 500\n", "weighted avg 0.87 0.87 0.87 500\n", "\n", "准确率: 0.868\n", "AUC: 0.9314258999532491\n", "saved model: ./model/lstm_4.model\n", "epoch:5, step:10, loss:0.2638005316257477\n", "epoch:5, step:20, loss:0.3028942048549652\n", "epoch:5, step:30, loss:0.2819410562515259\n", "epoch:5, step:40, loss:0.2857419550418854\n", "epoch:5, step:50, loss:0.3177730441093445\n", "epoch:5, step:60, loss:0.3140687346458435\n", "epoch:5, step:70, loss:0.32480892539024353\n", "epoch:5, step:80, loss:0.2964351177215576\n", "epoch:5, step:90, loss:0.27567631006240845\n", "epoch:5, step:100, loss:0.2848973870277405\n", " precision recall f1-score support\n", "\n", " 0.0 0.83 0.74 0.78 155\n", " 1.0 0.89 0.93 0.91 345\n", "\n", " accuracy 0.87 500\n", " macro avg 0.86 0.83 0.84 500\n", "weighted avg 0.87 0.87 0.87 500\n", "\n", "准确率: 0.87\n", "AUC: 0.9310892940626461\n", "saved model: ./model/lstm_5.model\n" ] } ], "source": [ "# 迭代训练\n", "for epoch in range(num_epoches):\n", " total_loss = 0\n", " for i, (x, labels, lengths) in enumerate(train_loader):\n", " x = x.to(device)\n", " labels = labels.to(device)\n", " outputs = lstm(x, lengths) # 前向传播\n", " logits = outputs.view(-1) # 将输出展平\n", " loss = criterion(logits, labels) # loss计算\n", " total_loss += loss\n", " optimizer.zero_grad() # 梯度清零\n", " loss.backward(retain_graph=True) # 反向传播,计算梯度\n", " optimizer.step() # 梯度更新\n", " if (i+1) % 10 == 0:\n", " print(\"epoch:{}, step:{}, loss:{}\".format(epoch+1, i+1, total_loss/10))\n", " total_loss = 0\n", " \n", " # test\n", " test()\n", " \n", " # save model\n", " model_path = \"./model/lstm_{}.model\".format(epoch+1)\n", " torch.save(lstm, model_path)\n", " print(\"saved model: \", model_path)" ] }, { "cell_type": "markdown", "metadata": { "cell_id": 36 }, "source": [ "### 手动输入句子,判断情感倾向(1正/0负)" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "cell_id": 51 }, "outputs": [], "source": [ "net = torch.load(\"./model/lstm_5.model\") # 训练过程中的巅峰时刻" ] }, { "cell_type": "code", "execution_count": 26, "metadata": { "cell_id": 52 }, "outputs": [ { "data": { "text/plain": [ "tensor([0.9657, 0.3921])" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from utils import processing\n", "\n", "strs = [\"我想说我会爱你多一点点\", \"日有所思梦感伤\"]\n", "\n", "data = []\n", "for s in strs:\n", " vectors = []\n", " for w in processing(s).split(\" \"):\n", " if w in word2vec.wv.key_to_index:\n", " vectors.append(word2vec.wv[w]) # 将每个词替换为对应的词向量\n", " vectors = torch.Tensor(vectors)\n", " data.append(vectors)\n", "x, _, lengths = collate_fn(list(zip(data, [-1] * len(strs))))\n", "with torch.no_grad():\n", " x = x.to(device)\n", " outputs = lstm(x, lengths) # 前向传播\n", " outputs = outputs.view(-1) # 将输出展平\n", "outputs" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "cell_id": 54 }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.5" }, "max_cell_id": 55 }, "nbformat": 4, "nbformat_minor": 5 }