{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 加载数据集"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from utils import load_corpus, stopwords\n",
    "\n",
    "# Relative locations of the two corpus splits.\n",
    "TRAIN_PATH = \"./data/weibo2018/train.txt\"\n",
    "TEST_PATH = \"./data/weibo2018/test.txt\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Building prefix dict from the default dictionary ...\n",
      "Dumping model to file cache /var/folders/rt/khjltk4j6n78x9x3f20hdr6m0000gp/T/jieba.cache\n",
      "Loading model cost 1.013 seconds.\n",
      "Prefix dict has been built successfully.\n"
     ]
    }
   ],
   "source": [
    "# Read both corpus splits with load_corpus, training split first.\n",
    "train_data, test_data = [load_corpus(path) for path in (TRAIN_PATH, TEST_PATH)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
wordslabel
0书中 自有 黄金屋 书中 自有 颜如玉 沿着 岁月 的 长河 跋涉 或是 风光旖旎 或是 姹...1
1这是 英超 被 黑 的 最惨 的 一次 二哈 二哈 十几年来 中国 只有 孙继海 董方卓 郑...0
2中国 远洋 海运 集团 副总经理 俞曾 港 月 日 在 上 表示 中央 企业 走 出去 是 ...1
3看 流星花园 其实 也 还好 啦 现在 的 观念 以及 时尚 眼光 都 不一样 了 或许 十...1
4汉武帝 的 罪己 诏 的 真实性 尽管 存在 着 争议 然而 轮台 罪己 诏 作为 中国 历...1
\n", "
" ],
      "text/plain": [
       " words label\n",
       "0 书中 自有 黄金屋 书中 自有 颜如玉 沿着 岁月 的 长河 跋涉 或是 风光旖旎 或是 姹... 1\n",
       "1 这是 英超 被 黑 的 最惨 的 一次 二哈 二哈 十几年来 中国 只有 孙继海 董方卓 郑... 0\n",
       "2 中国 远洋 海运 集团 副总经理 俞曾 港 月 日 在 上 表示 中央 企业 走 出去 是 ... 1\n",
       "3 看 流星花园 其实 也 还好 啦 现在 的 观念 以及 时尚 眼光 都 不一样 了 或许 十... 1\n",
       "4 汉武帝 的 罪己 诏 的 真实性 尽管 存在 着 争议 然而 轮台 罪己 诏 作为 中国 历... 1"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# Each corpus record becomes one row: the space-separated text and its 0/1 label.\n",
    "df_train = pd.DataFrame(train_data, columns=[\"words\", \"label\"])\n",
    "df_test = pd.DataFrame(test_data, columns=[\"words\", \"label\"])\n",
    "df_train.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 特征编码"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/albertdxq/opt/anaconda3/lib/python3.8/site-packages/sklearn/feature_extraction/text.py:383: UserWarning: Your stop_words may be inconsistent with your preprocessing. Tokenizing the stop words generated tokens ['元', '吨', '数', '末'] not in stop_words.\n",
      " warnings.warn('Your stop_words may be inconsistent with '\n"
     ]
    }
   ],
   "source": [
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "\n",
    "# Bag-of-words counts, capped at the 2000 most frequent terms.\n",
    "# A token is a run of word characters, optionally wrapped in square brackets.\n",
    "vectorizer = CountVectorizer(\n",
    "    token_pattern=r'\\[?\\w+\\]?',  # raw string: same regex, without invalid-escape warnings\n",
    "    stop_words=stopwords,\n",
    "    max_features=2000,\n",
    ")\n",
    "X_train = vectorizer.fit_transform(df_train[\"words\"])\n",
    "y_train = df_train[\"label\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# transform() only: reuse the vocabulary fitted on the training set.\n",
    "X_test = vectorizer.transform(df_test[\"words\"])\n",
    "y_test = df_test[\"label\"]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 训练模型&测试"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "import xgboost as xgb\n",
    "\n",
    "param = {\n",
    "    'booster': 'gbtree',\n",
    "    'max_depth': 6,\n",
    "    'scale_pos_weight': 0.5,  # < 1 down-weights positive samples\n",
    "    'colsample_bytree': 0.8,\n",
    "    'objective': 'binary:logistic',  # predict() then returns probabilities\n",
    "    'eval_metric': 'error',\n",
    "    'eta': 0.3,\n",
    "    'nthread': 10,\n",
    "}\n",
    "# A dedicated name per DMatrix, so later cells cannot pick up the wrong one.\n",
    "dtrain = xgb.DMatrix(X_train, label=y_train)\n",
    "model = xgb.train(param, dtrain, num_boost_round=200)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Score the test set and keep the raw probabilities under their own name.\n",
    "dtest = xgb.DMatrix(X_test)\n",
    "y_prob = model.predict(dtest)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      " precision recall f1-score support\n",
      "\n",
      " 0 0.75 0.82 0.78 155\n",
      " 1 0.92 0.88 0.90 345\n",
      "\n",
      " accuracy 0.86 500\n",
      " macro avg 0.83 0.85 0.84 500\n",
      "weighted avg 0.86 0.86 0.86 500\n",
      "\n",
      "准确率: 0.86\n",
      "AUC: 0.9040205703599813\n"
     ]
    }
   ],
   "source": [
    "# Evaluate on the test set.\n",
    "from sklearn import metrics\n",
    "\n",
    "# AUC is computed from the raw probabilities, not from thresholded labels.\n",
    "auc_score = metrics.roc_auc_score(y_test, y_prob)\n",
    "# Threshold into 0/1 labels under a new name: y_prob is left untouched, so\n",
    "# re-running this cell reports the same AUC instead of one computed on labels.\n",
    "y_pred = (y_prob > 0.5).astype(int)\n",
    "print(metrics.classification_report(y_test, y_pred))\n",
    "print(\"准确率:\", metrics.accuracy_score(y_test, y_pred))\n",
    "print(\"AUC:\", auc_score)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 手动输入句子,判断情感倾向(1正/0负)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "from utils import processing\n",
    "\n",
    "# Run the raw sentences through processing() and the fitted vectorizer before scoring.\n",
    "strs = [\"哈哈哈哈哈笑死我了\", \"我也是有脾气的!\"]\n",
    "words = [processing(s) for s in strs]\n",
    "vec = vectorizer.transform(words)\n",
    "dinput = xgb.DMatrix(vec)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([0.8683682, 0.3285784], dtype=float32)"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# One score per input sentence.\n",
    "output = model.predict(dinput)\n",
    "output"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
"language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.5" } }, "nbformat": 4, "nbformat_minor": 4 }