bettafish-company/WeiboSentiment_MachineLearning/4.lstm.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "cell_id": 40
   },
   "source": [
    "### 加载数据集"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "cell_id": 1
   },
   "outputs": [],
   "source": [
    "from utils import load_corpus, stopwords\n",
    "\n",
    "TRAIN_PATH = \"./data/weibo2018/train.txt\"\n",
    "TEST_PATH = \"./data/weibo2018/test.txt\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "cell_id": 2
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Building prefix dict from the default dictionary ...\n",
      "Loading model from cache /tmp/jieba.cache\n",
      "Loading model cost 0.826 seconds.\n",
      "Prefix dict has been built successfully.\n"
     ]
    }
   ],
   "source": [
    "# 分别加载训练集和测试集\n",
    "train_data = load_corpus(TRAIN_PATH)\n",
    "test_data = load_corpus(TEST_PATH)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "cell_id": 3
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>text</th>\n",
       "      <th>label</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>书中 自有 黄金屋 书中 自有 颜如玉 沿着 岁月 的 长河 跋涉 或是 风光旖旎 或是 姹...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>这是 英超 被 黑 的 最惨 的 一次 二哈 二哈 十几年来 中国 只有 孙继海 董方卓 郑...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>中国 远洋 海运 集团 副总经理 俞曾 港 月 日 在 上 表示 中央 企业 走 出去 是 ...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>看 流星花园 其实 也 还好 啦 现在 的 观念 以及 时尚 眼光 都 不一样 了 或许 十...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>汉武帝 的 罪己 诏 的 真实性 尽管 存在 着 争议 然而 轮台 罪己 诏 作为 中国 历...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                text  label\n",
       "0  书中 自有 黄金屋 书中 自有 颜如玉 沿着 岁月 的 长河 跋涉 或是 风光旖旎 或是 姹...      1\n",
       "1  这是 英超 被 黑 的 最惨 的 一次 二哈 二哈 十几年来 中国 只有 孙继海 董方卓 郑...      0\n",
       "2  中国 远洋 海运 集团 副总经理 俞曾 港 月 日 在 上 表示 中央 企业 走 出去 是 ...      1\n",
       "3  看 流星花园 其实 也 还好 啦 现在 的 观念 以及 时尚 眼光 都 不一样 了 或许 十...      1\n",
       "4  汉武帝 的 罪己 诏 的 真实性 尽管 存在 着 争议 然而 轮台 罪己 诏 作为 中国 历...      1"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "\n",
    "df_train = pd.DataFrame(train_data, columns=[\"text\", \"label\"])\n",
    "df_test = pd.DataFrame(test_data, columns=[\"text\", \"label\"])\n",
    "df_train.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "cell_id": 42
   },
   "source": [
    "### 训练词向量"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "cell_id": 44
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    [书中, 自有, 黄金屋, 书中, 自有, 颜如玉, 沿着, 岁月, 的, 长河, 跋涉, ...\n",
       "1    [这是, 英超, 被, 黑, 的, 最惨, 的, 一次, 二哈, 二哈, 十几年来, 中国,...\n",
       "2    [中国, 远洋, 海运, 集团, 副总经理, 俞曾, 港, 月, 日, 在, 上, 表示, ...\n",
       "3    [看, 流星花园, 其实, 也, 还好, 啦, 现在, 的, 观念, 以及, 时尚, 眼光,...\n",
       "4    [汉武帝, 的, 罪己, 诏, 的, 真实性, 尽管, 存在, 着, 争议, 然而, 轮台,...\n",
       "Name: text, dtype: object"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# word2vec要求的输入格式: list(word)\n",
    "wv_input = df_train['text'].map(lambda s: s.split(\" \"))   # [for w in s.split(\" \") if w not in stopwords]\n",
    "wv_input.head()                         "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "cell_id": 4
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/tiger/.local/lib/python3.7/site-packages/gensim/similarities/__init__.py:15: UserWarning: The gensim.similarities.levenshtein submodule is disabled, because the optional Levenshtein package <https://pypi.org/project/python-Levenshtein/> is unavailable. Install Levenhstein (e.g. `pip install python-Levenshtein`) to suppress this warning.\n",
      "  warnings.warn(msg)\n"
     ]
    }
   ],
   "source": [
    "from gensim import models\n",
    "\n",
    "# Word2Vec\n",
    "word2vec = models.Word2Vec(wv_input, \n",
    "                           vector_size=64,   # 词向量维度\n",
    "                           min_count=1,      # 最小词频, 因为数据量较小, 这里卡1\n",
    "                           epochs=1000)      # 迭代轮次"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "cell_id": 46
   },
   "source": [
    "查找近义词, 直观感受训练得到的word2vec效果"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "cell_id": 5,
    "scrolled": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('我', 0.9441561102867126),\n",
       " ('自己', 0.8928312659263611),\n",
       " ('他', 0.8796129822731018),\n",
       " ('的', 0.8601957559585571),\n",
       " ('她', 0.855070948600769),\n",
       " ('人', 0.8349815607070923),\n",
       " ('都', 0.8168802261352539),\n",
       " ('了', 0.8017680644989014),\n",
       " ('就', 0.7990766763687134),\n",
       " ('也', 0.7883183360099792)]"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "word2vec.wv.most_similar(\"你\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "cell_id": 38
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('哈哈哈', 0.6309624910354614),\n",
       " ('啦', 0.5457888841629028),\n",
       " ('可爱', 0.5375339984893799),\n",
       " ('了', 0.4885959327220917),\n",
       " ('本柔', 0.46517741680145264),\n",
       " ('笑', 0.4639575779438019),\n",
       " ('哈哈哈哈', 0.45851588249206543),\n",
       " ('心虚', 0.4576280415058136),\n",
       " ('又', 0.45520466566085815),\n",
       " ('呀', 0.4494859576225281)]"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "word2vec.wv.most_similar(\"哈哈\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "cell_id": 39
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('难过', 0.724579393863678),\n",
       " ('哭', 0.6421604752540588),\n",
       " ('想', 0.6415957808494568),\n",
       " ('也', 0.6394745707511902),\n",
       " ('真的', 0.6263709664344788),\n",
       " ('我', 0.6136066317558289),\n",
       " ('都', 0.608888566493988),\n",
       " ('的', 0.6078368425369263),\n",
       " ('就', 0.5916700959205627),\n",
       " ('开心', 0.5899774432182312)]"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "word2vec.wv.most_similar(\"伤心\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "cell_id": 48
   },
   "source": [
    "### 神经网络"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "cell_id": 14
   },
   "outputs": [],
   "source": [
    "import torch\n",
    "from torch import nn\n",
    "from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence,pad_packed_sequence\n",
    "from torch.utils.data import Dataset, DataLoader\n",
    "\n",
    "device = \"cuda:0\" if torch.cuda.is_available() else \"cpu\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "cell_id": 19
   },
   "outputs": [],
   "source": [
    "# 超参数\n",
    "learning_rate = 5e-4\n",
    "input_size = 768\n",
    "num_epoches = 5\n",
    "batch_size = 100\n",
    "embed_size = 64\n",
    "hidden_size = 64\n",
    "num_layers = 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "cell_id": 7
   },
   "outputs": [],
   "source": [
    "# 数据集\n",
    "class MyDataset(Dataset):\n",
    "    def __init__(self, df):\n",
    "        self.data = []\n",
    "        self.label = df[\"label\"].tolist()\n",
    "        for s in df[\"text\"].tolist():\n",
    "            vectors = []\n",
    "            for w in s.split(\" \"):\n",
    "                if w in word2vec.wv.key_to_index:\n",
    "                    vectors.append(word2vec.wv[w])   # 将每个词替换为对应的词向量\n",
    "            vectors = torch.Tensor(vectors)\n",
    "            self.data.append(vectors)\n",
    "    \n",
    "    def __getitem__(self, index):\n",
    "        data = self.data[index]\n",
    "        label = self.label[index]\n",
    "        return data, label\n",
    "\n",
    "    def __len__(self):\n",
    "        return len(self.label)\n",
    "\n",
    "def collate_fn(data):\n",
    "    \"\"\"\n",
    "    :param data: 第0维：data，第1维：label\n",
    "    :return: 序列化的data、记录实际长度的序列、以及label列表\n",
    "    \"\"\"\n",
    "    data.sort(key=lambda x: len(x[0]), reverse=True) # pack_padded_sequence要求要按照序列的长度倒序排列\n",
    "    data_length = [len(sq[0]) for sq in data]\n",
    "    x = [i[0] for i in data]\n",
    "    y = [i[1] for i in data]\n",
    "    data = pad_sequence(x, batch_first=True, padding_value=0)   # 用RNN处理变长序列的必要操作\n",
    "    return data, torch.tensor(y, dtype=torch.float32), data_length\n",
    "\n",
    "\n",
    "# 训练集\n",
    "train_data = MyDataset(df_train)\n",
    "train_loader = DataLoader(train_data, batch_size=batch_size, collate_fn=collate_fn, shuffle=True)\n",
    "\n",
    "# 测试集\n",
    "test_data = MyDataset(df_test)\n",
    "test_loader = DataLoader(test_data, batch_size=batch_size, collate_fn=collate_fn, shuffle=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "cell_id": 11
   },
   "outputs": [],
   "source": [
    "# 网络结构\n",
    "class LSTM(nn.Module):\n",
    "    def __init__(self, input_size, hidden_size, num_layers):\n",
    "        super(LSTM, self).__init__()\n",
    "        self.hidden_size = hidden_size\n",
    "        self.num_layers = num_layers\n",
    "        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, bidirectional=True)\n",
    "        self.fc = nn.Linear(hidden_size * 2, 1)  # 双向, 输出维度要*2\n",
    "        self.sigmoid = nn.Sigmoid()\n",
    "\n",
    "    def forward(self, x, lengths):\n",
    "        h0 = torch.zeros(self.num_layers * 2, x.size(0), self.hidden_size).to(device)  # 双向, 第一个维度要*2\n",
    "        c0 = torch.zeros(self.num_layers * 2, x.size(0), self.hidden_size).to(device)\n",
    "        \n",
    "        packed_input = torch.nn.utils.rnn.pack_padded_sequence(input=x, lengths=lengths, batch_first=True)\n",
    "        packed_out, (h_n, h_c) = self.lstm(packed_input, (h0, c0))\n",
    "\n",
    "        lstm_out = torch.cat([h_n[-2], h_n[-1]], 1)  # 双向, 所以要将最后两维拼接, 得到的就是最后一个time step的输出\n",
    "        out = self.fc(lstm_out)\n",
    "        out = self.sigmoid(out)\n",
    "        return out\n",
    "\n",
    "lstm = LSTM(embed_size, hidden_size, num_layers)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "cell_id": 26
   },
   "outputs": [],
   "source": [
    "from sklearn import metrics\n",
    "\n",
    "# 在测试集效果检验\n",
    "def test():\n",
    "    y_pred, y_true = [], []\n",
    "\n",
    "    with torch.no_grad():\n",
    "        for x, labels, lengths in test_loader:\n",
    "            x = x.to(device)\n",
    "            outputs = lstm(x, lengths)          # 前向传播\n",
    "            outputs = outputs.view(-1)          # 将输出展平\n",
    "            y_pred.append(outputs)\n",
    "            y_true.append(labels)\n",
    "\n",
    "    y_prob = torch.cat(y_pred)\n",
    "    y_true = torch.cat(y_true)\n",
    "    y_pred = y_prob.clone()\n",
    "    y_pred[y_pred > 0.5] = 1\n",
    "    y_pred[y_pred <= 0.5] = 0\n",
    "    \n",
    "    print(metrics.classification_report(y_true, y_pred))\n",
    "    print(\"准确率:\", metrics.accuracy_score(y_true, y_pred))\n",
    "    print(\"AUC:\", metrics.roc_auc_score(y_true, y_prob) )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "cell_id": 32
   },
   "outputs": [],
   "source": [
    "# 定义损失函数和优化器\n",
    "criterion = nn.BCELoss()\n",
    "optimizer = torch.optim.Adam(lstm.parameters(), lr=learning_rate)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "cell_id": 33,
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "epoch:1, step:10, loss:0.689099133014679\n",
      "epoch:1, step:20, loss:0.6717442870140076\n",
      "epoch:1, step:30, loss:0.650161862373352\n",
      "epoch:1, step:40, loss:0.5935518741607666\n",
      "epoch:1, step:50, loss:0.4994719922542572\n",
      "epoch:1, step:60, loss:0.4774974286556244\n",
      "epoch:1, step:70, loss:0.482360303401947\n",
      "epoch:1, step:80, loss:0.44858306646347046\n",
      "epoch:1, step:90, loss:0.4513603746891022\n",
      "epoch:1, step:100, loss:0.4386572241783142\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "         0.0       0.75      0.80      0.78       155\n",
      "         1.0       0.91      0.88      0.89       345\n",
      "\n",
      "    accuracy                           0.86       500\n",
      "   macro avg       0.83      0.84      0.83       500\n",
      "weighted avg       0.86      0.86      0.86       500\n",
      "\n",
      "准确率: 0.856\n",
      "AUC: 0.9141841982234689\n",
      "saved model:  ./model/lstm_1.model\n",
      "epoch:2, step:10, loss:0.4317778944969177\n",
      "epoch:2, step:20, loss:0.41387200355529785\n",
      "epoch:2, step:30, loss:0.4237545430660248\n",
      "epoch:2, step:40, loss:0.364933043718338\n",
      "epoch:2, step:50, loss:0.37595903873443604\n",
      "epoch:2, step:60, loss:0.4067295491695404\n",
      "epoch:2, step:70, loss:0.41071224212646484\n",
      "epoch:2, step:80, loss:0.39134103059768677\n",
      "epoch:2, step:90, loss:0.37907883524894714\n",
      "epoch:2, step:100, loss:0.4322803020477295\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "         0.0       0.80      0.63      0.71       155\n",
      "         1.0       0.85      0.93      0.89       345\n",
      "\n",
      "    accuracy                           0.84       500\n",
      "   macro avg       0.83      0.78      0.80       500\n",
      "weighted avg       0.83      0.84      0.83       500\n",
      "\n",
      "准确率: 0.838\n",
      "AUC: 0.9174193548387096\n",
      "saved model:  ./model/lstm_2.model\n",
      "epoch:3, step:10, loss:0.37696003913879395\n",
      "epoch:3, step:20, loss:0.36385685205459595\n",
      "epoch:3, step:30, loss:0.3907310664653778\n",
      "epoch:3, step:40, loss:0.35576874017715454\n",
      "epoch:3, step:50, loss:0.36152324080467224\n",
      "epoch:3, step:60, loss:0.3620041608810425\n",
      "epoch:3, step:70, loss:0.32647013664245605\n",
      "epoch:3, step:80, loss:0.38903307914733887\n",
      "epoch:3, step:90, loss:0.34238141775131226\n",
      "epoch:3, step:100, loss:0.3952549397945404\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "         0.0       0.75      0.79      0.77       155\n",
      "         1.0       0.90      0.88      0.89       345\n",
      "\n",
      "    accuracy                           0.85       500\n",
      "   macro avg       0.83      0.84      0.83       500\n",
      "weighted avg       0.86      0.85      0.85       500\n",
      "\n",
      "准确率: 0.854\n",
      "AUC: 0.9280411407199626\n",
      "saved model:  ./model/lstm_3.model\n",
      "epoch:4, step:10, loss:0.34902292490005493\n",
      "epoch:4, step:20, loss:0.3277026116847992\n",
      "epoch:4, step:30, loss:0.32119297981262207\n",
      "epoch:4, step:40, loss:0.34501412510871887\n",
      "epoch:4, step:50, loss:0.3202686905860901\n",
      "epoch:4, step:60, loss:0.3599391579627991\n",
      "epoch:4, step:70, loss:0.2958642542362213\n",
      "epoch:4, step:80, loss:0.3152882158756256\n",
      "epoch:4, step:90, loss:0.3151417374610901\n",
      "epoch:4, step:100, loss:0.3314781188964844\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "         0.0       0.78      0.81      0.79       155\n",
      "         1.0       0.91      0.90      0.90       345\n",
      "\n",
      "    accuracy                           0.87       500\n",
      "   macro avg       0.84      0.85      0.85       500\n",
      "weighted avg       0.87      0.87      0.87       500\n",
      "\n",
      "准确率: 0.868\n",
      "AUC: 0.9314258999532491\n",
      "saved model:  ./model/lstm_4.model\n",
      "epoch:5, step:10, loss:0.2638005316257477\n",
      "epoch:5, step:20, loss:0.3028942048549652\n",
      "epoch:5, step:30, loss:0.2819410562515259\n",
      "epoch:5, step:40, loss:0.2857419550418854\n",
      "epoch:5, step:50, loss:0.3177730441093445\n",
      "epoch:5, step:60, loss:0.3140687346458435\n",
      "epoch:5, step:70, loss:0.32480892539024353\n",
      "epoch:5, step:80, loss:0.2964351177215576\n",
      "epoch:5, step:90, loss:0.27567631006240845\n",
      "epoch:5, step:100, loss:0.2848973870277405\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "         0.0       0.83      0.74      0.78       155\n",
      "         1.0       0.89      0.93      0.91       345\n",
      "\n",
      "    accuracy                           0.87       500\n",
      "   macro avg       0.86      0.83      0.84       500\n",
      "weighted avg       0.87      0.87      0.87       500\n",
      "\n",
      "准确率: 0.87\n",
      "AUC: 0.9310892940626461\n",
      "saved model:  ./model/lstm_5.model\n"
     ]
    }
   ],
   "source": [
    "# 迭代训练\n",
    "for epoch in range(num_epoches):\n",
    "    total_loss = 0\n",
    "    for i, (x, labels, lengths) in enumerate(train_loader):\n",
    "        x = x.to(device)\n",
    "        labels = labels.to(device)\n",
    "        outputs = lstm(x, lengths)          # 前向传播\n",
    "        logits = outputs.view(-1)           # 将输出展平\n",
    "        loss = criterion(logits, labels)    # loss计算\n",
    "        total_loss += loss\n",
    "        optimizer.zero_grad()               # 梯度清零\n",
    "        loss.backward(retain_graph=True)    # 反向传播，计算梯度\n",
    "        optimizer.step()                    # 梯度更新\n",
    "        if (i+1) % 10 == 0:\n",
    "            print(\"epoch:{}, step:{}, loss:{}\".format(epoch+1, i+1, total_loss/10))\n",
    "            total_loss = 0\n",
    "    \n",
    "    # test\n",
    "    test()\n",
    "    \n",
    "    # save model\n",
    "    model_path = \"./model/lstm_{}.model\".format(epoch+1)\n",
    "    torch.save(lstm, model_path)\n",
    "    print(\"saved model: \", model_path)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "cell_id": 36
   },
   "source": [
    "### 手动输入句子，判断情感倾向（1正/0负）"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "cell_id": 51
   },
   "outputs": [],
   "source": [
    "net = torch.load(\"./model/lstm_5.model\")    # 训练过程中的巅峰时刻"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {
    "cell_id": 52
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "tensor([0.9657, 0.3921])"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from utils import processing\n",
    "\n",
    "strs = [\"我想说我会爱你多一点点\", \"日有所思梦感伤\"]\n",
    "\n",
    "data = []\n",
    "for s in strs:\n",
    "    vectors = []\n",
    "    for w in processing(s).split(\" \"):\n",
    "        if w in word2vec.wv.key_to_index:\n",
    "            vectors.append(word2vec.wv[w])   # 将每个词替换为对应的词向量\n",
    "    vectors = torch.Tensor(vectors)\n",
    "    data.append(vectors)\n",
    "x, _, lengths = collate_fn(list(zip(data, [-1] * len(strs))))\n",
    "with torch.no_grad():\n",
    "    x = x.to(device)\n",
    "    outputs = lstm(x, lengths)          # 前向传播\n",
    "    outputs = outputs.view(-1)          # 将输出展平\n",
    "outputs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "cell_id": 54
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  },
  "max_cell_id": 55
 },
 "nbformat": 4,
 "nbformat_minor": 5
}