{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 加载数据集"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# All imports live in this first cell so a fresh kernel can run the notebook top to bottom.\n",
    "import pandas as pd\n",
    "from sklearn import metrics, svm\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "\n",
    "from utils import load_corpus, processing, stopwords\n",
    "\n",
    "TRAIN_PATH = \"./data/weibo2018/train.txt\"\n",
    "TEST_PATH = \"./data/weibo2018/test.txt\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the training set and the test set separately\n",
    "train_data = load_corpus(TRAIN_PATH)\n",
    "test_data = load_corpus(TEST_PATH)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>words</th>\n",
       "      <th>label</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>书中 自有 黄金屋 书中 自有 颜如玉 沿着 岁月 的 长河 跋涉 或是 风光旖旎 或是 姹...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>这是 英超 被 黑 的 最惨 的 一次 二哈 二哈 十几年来 中国 只有 孙继海 董方卓 郑...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>中国 远洋 海运 集团 副总经理 俞曾 港 月 日 在 上 表示 中央 企业 走 出去 是 ...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>看 流星花园 其实 也 还好 啦 现在 的 观念 以及 时尚 眼光 都 不一样 了 或许 十...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>汉武帝 的 罪己 诏 的 真实性 尽管 存在 着 争议 然而 轮台 罪己 诏 作为 中国 历...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                               words  label\n",
       "0  书中 自有 黄金屋 书中 自有 颜如玉 沿着 岁月 的 长河 跋涉 或是 风光旖旎 或是 姹...      1\n",
       "1  这是 英超 被 黑 的 最惨 的 一次 二哈 二哈 十几年来 中国 只有 孙继海 董方卓 郑...      0\n",
       "2  中国 远洋 海运 集团 副总经理 俞曾 港 月 日 在 上 表示 中央 企业 走 出去 是 ...      1\n",
       "3  看 流星花园 其实 也 还好 啦 现在 的 观念 以及 时尚 眼光 都 不一样 了 或许 十...      1\n",
       "4  汉武帝 的 罪己 诏 的 真实性 尽管 存在 着 争议 然而 轮台 罪己 诏 作为 中国 历...      1"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_train = pd.DataFrame(train_data, columns=[\"words\", \"label\"])\n",
    "df_test = pd.DataFrame(test_data, columns=[\"words\", \"label\"])\n",
    "df_train.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 特征编码(Tf-Idf模型)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The raw string keeps the regex backslashes literal, so Python emits no\n",
    "# invalid-escape-sequence warning. The pattern keeps optional square brackets\n",
    "# attached to a token, so a bracketed tag such as \"[愤怒]\" stays one token.\n",
    "vectorizer = TfidfVectorizer(token_pattern=r'\\[?\\w+\\]?',\n",
    "                             stop_words=stopwords)\n",
    "X_train = vectorizer.fit_transform(df_train[\"words\"])\n",
    "y_train = df_train[\"label\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [],
   "source": [
    "# transform (not fit_transform): reuse the vectorizer fitted on the training set\n",
    "X_test = vectorizer.transform(df_test[\"words\"])\n",
    "y_test = df_test[\"label\"]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 训练模型&测试"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "SVC()"
      ]
     },
     "execution_count": 58,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "clf = svm.SVC()\n",
    "clf.fit(X_train, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Predict labels for the test set with the trained model\n",
    "y_pred = clf.predict(X_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "              precision    recall  f1-score   support\n",
      "\n",
      "           0       0.82      0.69      0.75       155\n",
      "           1       0.87      0.93      0.90       345\n",
      "\n",
      "    accuracy                           0.86       500\n",
      "   macro avg       0.84      0.81      0.82       500\n",
      "weighted avg       0.85      0.86      0.85       500\n",
      "\n",
      "准确率: 0.856\n"
     ]
    }
   ],
   "source": [
    "# Evaluate the predictions on the test set\n",
    "print(metrics.classification_report(y_test, y_pred))\n",
    "print(\"准确率:\", metrics.accuracy_score(y_test, y_pred))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 手动输入句子,判断情感倾向"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [],
   "source": [
    "strs = [\"只要流过的汗与泪都能化作往后的明亮,就值得你为自己喝彩\", \"烦死了!为什么周末还要加班[愤怒]\"]\n",
    "# NOTE(review): processing() is defined in utils; presumably it applies the same\n",
    "# preprocessing as load_corpus -- confirm against utils.\n",
    "words = [processing(s) for s in strs]\n",
    "vec = vectorizer.transform(words)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([0, 0])"
      ]
     },
     "execution_count": 50,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "output = clf.predict(vec)\n",
    "output"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}