diff --git a/.gitignore b/.gitignore
index a63202a..078911e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -160,7 +160,7 @@ Desktop.ini
# */*/models/
# PyTorch 模型文件
-*.pth
+# *.pth
*.pt
*.bin
*.ckpt
@@ -183,6 +183,7 @@ WeiboSentiment_Finetuned/BertChinese-Lora/models/
WeiboSentiment_LLM/models/
WeiboSentiment_Finetuned/BertChinese-Lora/model/
WeiboMultilingualSentiment/model/
+WeiboSentiment_MachineLearning/model/chinese_wwm_pytorch/
# LoRA 和 Adapter 权重
*/adapter_model.safetensors
@@ -210,8 +211,8 @@ checkpoints/
# */waimai_10k.csv
# 其他数据文件
-*.pkl
-*.pickle
+# *.pkl
+# *.pickle
*.json.gz
*.jsonl
*.parquet
diff --git a/WeiboSentiment_MachineLearning/1.bayes.ipynb b/WeiboSentiment_MachineLearning/1.bayes.ipynb
deleted file mode 100644
index c6b1725..0000000
--- a/WeiboSentiment_MachineLearning/1.bayes.ipynb
+++ /dev/null
@@ -1,306 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### 加载数据集"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {},
- "outputs": [],
- "source": [
- "from utils import load_corpus, stopwords\n",
- "\n",
- "TRAIN_PATH = \"./data/weibo2018/train.txt\"\n",
- "TEST_PATH = \"./data/weibo2018/test.txt\""
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Building prefix dict from the default dictionary ...\n",
- "Loading model from cache /var/folders/rt/khjltk4j6n78x9x3f20hdr6m0000gp/T/jieba.cache\n",
- "Loading model cost 1.023 seconds.\n",
- "Prefix dict has been built successfully.\n"
- ]
- }
- ],
- "source": [
- "# 分别加载训练集和测试集\n",
- "train_data = load_corpus(TRAIN_PATH)\n",
- "test_data = load_corpus(TEST_PATH)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "
\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " words | \n",
- " label | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | 0 | \n",
- " 书中 自有 黄金屋 书中 自有 颜如玉 沿着 岁月 的 长河 跋涉 或是 风光旖旎 或是 姹... | \n",
- " 1 | \n",
- "
\n",
- " \n",
- " | 1 | \n",
- " 这是 英超 被 黑 的 最惨 的 一次 二哈 二哈 十几年来 中国 只有 孙继海 董方卓 郑... | \n",
- " 0 | \n",
- "
\n",
- " \n",
- " | 2 | \n",
- " 中国 远洋 海运 集团 副总经理 俞曾 港 月 日 在 上 表示 中央 企业 走 出去 是 ... | \n",
- " 1 | \n",
- "
\n",
- " \n",
- " | 3 | \n",
- " 看 流星花园 其实 也 还好 啦 现在 的 观念 以及 时尚 眼光 都 不一样 了 或许 十... | \n",
- " 1 | \n",
- "
\n",
- " \n",
- " | 4 | \n",
- " 汉武帝 的 罪己 诏 的 真实性 尽管 存在 着 争议 然而 轮台 罪己 诏 作为 中国 历... | \n",
- " 1 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " words label\n",
- "0 书中 自有 黄金屋 书中 自有 颜如玉 沿着 岁月 的 长河 跋涉 或是 风光旖旎 或是 姹... 1\n",
- "1 这是 英超 被 黑 的 最惨 的 一次 二哈 二哈 十几年来 中国 只有 孙继海 董方卓 郑... 0\n",
- "2 中国 远洋 海运 集团 副总经理 俞曾 港 月 日 在 上 表示 中央 企业 走 出去 是 ... 1\n",
- "3 看 流星花园 其实 也 还好 啦 现在 的 观念 以及 时尚 眼光 都 不一样 了 或许 十... 1\n",
- "4 汉武帝 的 罪己 诏 的 真实性 尽管 存在 着 争议 然而 轮台 罪己 诏 作为 中国 历... 1"
- ]
- },
- "execution_count": 3,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "import pandas as pd\n",
- "\n",
- "df_train = pd.DataFrame(train_data, columns=[\"words\", \"label\"])\n",
- "df_test = pd.DataFrame(test_data, columns=[\"words\", \"label\"])\n",
- "df_train.head()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### 特征编码(词袋模型)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 61,
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/Users/albertdxq/opt/anaconda3/lib/python3.8/site-packages/sklearn/feature_extraction/text.py:383: UserWarning: Your stop_words may be inconsistent with your preprocessing. Tokenizing the stop words generated tokens ['元', '吨', '数', '末'] not in stop_words.\n",
- " warnings.warn('Your stop_words may be inconsistent with '\n"
- ]
- }
- ],
- "source": [
- "from sklearn.feature_extraction.text import CountVectorizer\n",
- "\n",
- "vectorizer = CountVectorizer(token_pattern='\\[?\\w+\\]?', \n",
- " stop_words=stopwords)\n",
- "X_train = vectorizer.fit_transform(df_train[\"words\"])\n",
- "y_train = df_train[\"label\"]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 62,
- "metadata": {},
- "outputs": [],
- "source": [
- "X_test = vectorizer.transform(df_test[\"words\"])\n",
- "y_test = df_test[\"label\"]"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### 训练模型&测试"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 63,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "MultinomialNB()"
- ]
- },
- "execution_count": 63,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "from sklearn.naive_bayes import MultinomialNB\n",
- "\n",
- "clf = MultinomialNB()\n",
- "clf.fit(X_train, y_train)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 64,
- "metadata": {},
- "outputs": [],
- "source": [
- "# 在测试集上用模型预测结果\n",
- "y_pred = clf.predict(X_test)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 65,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- " precision recall f1-score support\n",
- "\n",
- " 0 0.75 0.80 0.78 155\n",
- " 1 0.91 0.88 0.89 345\n",
- "\n",
- " accuracy 0.86 500\n",
- " macro avg 0.83 0.84 0.83 500\n",
- "weighted avg 0.86 0.86 0.86 500\n",
- "\n",
- "准确率: 0.856\n"
- ]
- }
- ],
- "source": [
- "# 测试集效果检验\n",
- "from sklearn import metrics\n",
- "\n",
- "print(metrics.classification_report(y_test, y_pred))\n",
- "print(\"准确率:\", metrics.accuracy_score(y_test, y_pred))"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### 手动输入句子,判断情感倾向"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 66,
- "metadata": {},
- "outputs": [],
- "source": [
- "from utils import processing\n",
- "\n",
- "strs = [\"终于收获一个最好消息\", \"哭了, 今天怎么这么倒霉\"]\n",
- "words = [processing(s) for s in strs]\n",
- "vec = vectorizer.transform(words)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 67,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "array([1, 0])"
- ]
- },
- "execution_count": 67,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "output = clf.predict(vec)\n",
- "output"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.8.5"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 4
-}
diff --git a/WeiboSentiment_MachineLearning/2.svm.ipynb b/WeiboSentiment_MachineLearning/2.svm.ipynb
deleted file mode 100644
index 27ed68f..0000000
--- a/WeiboSentiment_MachineLearning/2.svm.ipynb
+++ /dev/null
@@ -1,286 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### 加载数据集"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {},
- "outputs": [],
- "source": [
- "from utils import load_corpus, stopwords\n",
- "\n",
- "TRAIN_PATH = \"./data/weibo2018/train.txt\"\n",
- "TEST_PATH = \"./data/weibo2018/test.txt\""
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 13,
- "metadata": {},
- "outputs": [],
- "source": [
- "# 分别加载训练集和测试集\n",
- "train_data = load_corpus(TRAIN_PATH)\n",
- "test_data = load_corpus(TEST_PATH)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " words | \n",
- " label | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | 0 | \n",
- " 书中 自有 黄金屋 书中 自有 颜如玉 沿着 岁月 的 长河 跋涉 或是 风光旖旎 或是 姹... | \n",
- " 1 | \n",
- "
\n",
- " \n",
- " | 1 | \n",
- " 这是 英超 被 黑 的 最惨 的 一次 二哈 二哈 十几年来 中国 只有 孙继海 董方卓 郑... | \n",
- " 0 | \n",
- "
\n",
- " \n",
- " | 2 | \n",
- " 中国 远洋 海运 集团 副总经理 俞曾 港 月 日 在 上 表示 中央 企业 走 出去 是 ... | \n",
- " 1 | \n",
- "
\n",
- " \n",
- " | 3 | \n",
- " 看 流星花园 其实 也 还好 啦 现在 的 观念 以及 时尚 眼光 都 不一样 了 或许 十... | \n",
- " 1 | \n",
- "
\n",
- " \n",
- " | 4 | \n",
- " 汉武帝 的 罪己 诏 的 真实性 尽管 存在 着 争议 然而 轮台 罪己 诏 作为 中国 历... | \n",
- " 1 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " words label\n",
- "0 书中 自有 黄金屋 书中 自有 颜如玉 沿着 岁月 的 长河 跋涉 或是 风光旖旎 或是 姹... 1\n",
- "1 这是 英超 被 黑 的 最惨 的 一次 二哈 二哈 十几年来 中国 只有 孙继海 董方卓 郑... 0\n",
- "2 中国 远洋 海运 集团 副总经理 俞曾 港 月 日 在 上 表示 中央 企业 走 出去 是 ... 1\n",
- "3 看 流星花园 其实 也 还好 啦 现在 的 观念 以及 时尚 眼光 都 不一样 了 或许 十... 1\n",
- "4 汉武帝 的 罪己 诏 的 真实性 尽管 存在 着 争议 然而 轮台 罪己 诏 作为 中国 历... 1"
- ]
- },
- "execution_count": 3,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "import pandas as pd\n",
- "\n",
- "df_train = pd.DataFrame(train_data, columns=[\"words\", \"label\"])\n",
- "df_test = pd.DataFrame(test_data, columns=[\"words\", \"label\"])\n",
- "df_train.head()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### 特征编码(Tf-Idf模型)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 56,
- "metadata": {},
- "outputs": [],
- "source": [
- "from sklearn.feature_extraction.text import TfidfVectorizer\n",
- "\n",
- "vectorizer = TfidfVectorizer(token_pattern='\\[?\\w+\\]?', \n",
- " stop_words=stopwords)\n",
- "X_train = vectorizer.fit_transform(df_train[\"words\"])\n",
- "y_train = df_train[\"label\"]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 57,
- "metadata": {},
- "outputs": [],
- "source": [
- "X_test = vectorizer.transform(df_test[\"words\"])\n",
- "y_test = df_test[\"label\"]"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### 训练模型&测试"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 58,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "SVC()"
- ]
- },
- "execution_count": 58,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "from sklearn import svm\n",
- "\n",
- "clf = svm.SVC()\n",
- "clf.fit(X_train, y_train)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 59,
- "metadata": {},
- "outputs": [],
- "source": [
- "# 在测试集上用模型预测结果\n",
- "y_pred = clf.predict(X_test)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 60,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- " precision recall f1-score support\n",
- "\n",
- " 0 0.82 0.69 0.75 155\n",
- " 1 0.87 0.93 0.90 345\n",
- "\n",
- " accuracy 0.86 500\n",
- " macro avg 0.84 0.81 0.82 500\n",
- "weighted avg 0.85 0.86 0.85 500\n",
- "\n",
- "准确率: 0.856\n"
- ]
- }
- ],
- "source": [
- "# 测试集效果检验\n",
- "from sklearn import metrics\n",
- "\n",
- "print(metrics.classification_report(y_test, y_pred))\n",
- "print(\"准确率:\", metrics.accuracy_score(y_test, y_pred))"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### 手动输入句子,判断情感倾向"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 49,
- "metadata": {},
- "outputs": [],
- "source": [
- "from utils import processing\n",
- "\n",
- "strs = [\"只要流过的汗与泪都能化作往后的明亮,就值得你为自己喝彩\", \"烦死了!为什么周末还要加班[愤怒]\"]\n",
- "words = [processing(s) for s in strs]\n",
- "vec = vectorizer.transform(words)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 50,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "array([0, 0])"
- ]
- },
- "execution_count": 50,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "output = clf.predict(vec)\n",
- "output"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.8.5"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 4
-}
diff --git a/WeiboSentiment_MachineLearning/3.xgboost.ipynb b/WeiboSentiment_MachineLearning/3.xgboost.ipynb
deleted file mode 100644
index 0efdd70..0000000
--- a/WeiboSentiment_MachineLearning/3.xgboost.ipynb
+++ /dev/null
@@ -1,314 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### 加载数据集"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {},
- "outputs": [],
- "source": [
- "from utils import load_corpus, stopwords\n",
- "\n",
- "TRAIN_PATH = \"./data/weibo2018/train.txt\"\n",
- "TEST_PATH = \"./data/weibo2018/test.txt\""
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Building prefix dict from the default dictionary ...\n",
- "Dumping model to file cache /var/folders/rt/khjltk4j6n78x9x3f20hdr6m0000gp/T/jieba.cache\n",
- "Loading model cost 1.013 seconds.\n",
- "Prefix dict has been built successfully.\n"
- ]
- }
- ],
- "source": [
- "# 分别加载训练集和测试集\n",
- "train_data = load_corpus(TRAIN_PATH)\n",
- "test_data = load_corpus(TEST_PATH)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " words | \n",
- " label | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | 0 | \n",
- " 书中 自有 黄金屋 书中 自有 颜如玉 沿着 岁月 的 长河 跋涉 或是 风光旖旎 或是 姹... | \n",
- " 1 | \n",
- "
\n",
- " \n",
- " | 1 | \n",
- " 这是 英超 被 黑 的 最惨 的 一次 二哈 二哈 十几年来 中国 只有 孙继海 董方卓 郑... | \n",
- " 0 | \n",
- "
\n",
- " \n",
- " | 2 | \n",
- " 中国 远洋 海运 集团 副总经理 俞曾 港 月 日 在 上 表示 中央 企业 走 出去 是 ... | \n",
- " 1 | \n",
- "
\n",
- " \n",
- " | 3 | \n",
- " 看 流星花园 其实 也 还好 啦 现在 的 观念 以及 时尚 眼光 都 不一样 了 或许 十... | \n",
- " 1 | \n",
- "
\n",
- " \n",
- " | 4 | \n",
- " 汉武帝 的 罪己 诏 的 真实性 尽管 存在 着 争议 然而 轮台 罪己 诏 作为 中国 历... | \n",
- " 1 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " words label\n",
- "0 书中 自有 黄金屋 书中 自有 颜如玉 沿着 岁月 的 长河 跋涉 或是 风光旖旎 或是 姹... 1\n",
- "1 这是 英超 被 黑 的 最惨 的 一次 二哈 二哈 十几年来 中国 只有 孙继海 董方卓 郑... 0\n",
- "2 中国 远洋 海运 集团 副总经理 俞曾 港 月 日 在 上 表示 中央 企业 走 出去 是 ... 1\n",
- "3 看 流星花园 其实 也 还好 啦 现在 的 观念 以及 时尚 眼光 都 不一样 了 或许 十... 1\n",
- "4 汉武帝 的 罪己 诏 的 真实性 尽管 存在 着 争议 然而 轮台 罪己 诏 作为 中国 历... 1"
- ]
- },
- "execution_count": 3,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "import pandas as pd\n",
- "\n",
- "df_train = pd.DataFrame(train_data, columns=[\"words\", \"label\"])\n",
- "df_test = pd.DataFrame(test_data, columns=[\"words\", \"label\"])\n",
- "df_train.head()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### 特征编码"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/Users/albertdxq/opt/anaconda3/lib/python3.8/site-packages/sklearn/feature_extraction/text.py:383: UserWarning: Your stop_words may be inconsistent with your preprocessing. Tokenizing the stop words generated tokens ['元', '吨', '数', '末'] not in stop_words.\n",
- " warnings.warn('Your stop_words may be inconsistent with '\n"
- ]
- }
- ],
- "source": [
- "from sklearn.feature_extraction.text import CountVectorizer\n",
- "\n",
- "vectorizer = CountVectorizer(token_pattern='\\[?\\w+\\]?', \n",
- " stop_words=stopwords,\n",
- " max_features=2000)\n",
- "X_train = vectorizer.fit_transform(df_train[\"words\"])\n",
- "y_train = df_train[\"label\"]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 5,
- "metadata": {},
- "outputs": [],
- "source": [
- "X_test = vectorizer.transform(df_test[\"words\"])\n",
- "y_test = df_test[\"label\"]"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### 训练模型&测试"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 10,
- "metadata": {},
- "outputs": [],
- "source": [
- "import xgboost as xgb\n",
- "\n",
- "param = {\n",
- " 'booster':'gbtree',\n",
- " 'max_depth': 6, \n",
- " 'scale_pos_weight': 0.5,\n",
- " 'colsample_bytree': 0.8,\n",
- " 'objective': 'binary:logistic',\n",
- " 'eval_metric': 'error',\n",
- " 'eta': 0.3,\n",
- " 'nthread': 10,\n",
- "}\n",
- "dmatrix = xgb.DMatrix(X_train, label=y_train)\n",
- "model = xgb.train(param, dmatrix, num_boost_round=200)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 11,
- "metadata": {},
- "outputs": [],
- "source": [
- "# 在测试集上用模型预测结果\n",
- "dmatrix = xgb.DMatrix(X_test)\n",
- "y_pred = model.predict(dmatrix)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 12,
- "metadata": {
- "scrolled": false
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- " precision recall f1-score support\n",
- "\n",
- " 0 0.75 0.82 0.78 155\n",
- " 1 0.92 0.88 0.90 345\n",
- "\n",
- " accuracy 0.86 500\n",
- " macro avg 0.83 0.85 0.84 500\n",
- "weighted avg 0.86 0.86 0.86 500\n",
- "\n",
- "准确率: 0.86\n",
- "AUC: 0.9040205703599813\n"
- ]
- }
- ],
- "source": [
- "# 测试集效果检验\n",
- "from sklearn import metrics\n",
- "\n",
- "auc_score = metrics.roc_auc_score(y_test, y_pred) # 先计算AUC\n",
- "y_pred = list(map(lambda x:1 if x > 0.5 else 0, y_pred)) # 二值化\n",
- "print(metrics.classification_report(y_test, y_pred))\n",
- "print(\"准确率:\", metrics.accuracy_score(y_test, y_pred))\n",
- "print(\"AUC:\", auc_score)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### 手动输入句子,判断情感倾向(1正/0负)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 13,
- "metadata": {},
- "outputs": [],
- "source": [
- "from utils import processing\n",
- "\n",
- "strs = [\"哈哈哈哈哈笑死我了\", \"我也是有脾气的!\"]\n",
- "words = [processing(s) for s in strs]\n",
- "vec = vectorizer.transform(words)\n",
- "dmatrix = xgb.DMatrix(vec)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 14,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "array([0.8683682, 0.3285784], dtype=float32)"
- ]
- },
- "execution_count": 14,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "output = model.predict(dmatrix)\n",
- "output"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.8.5"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 4
-}
diff --git a/WeiboSentiment_MachineLearning/4.lstm.ipynb b/WeiboSentiment_MachineLearning/4.lstm.ipynb
deleted file mode 100644
index 203be55..0000000
--- a/WeiboSentiment_MachineLearning/4.lstm.ipynb
+++ /dev/null
@@ -1,717 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {
- "cell_id": 40
- },
- "source": [
- "### 加载数据集"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {
- "cell_id": 1
- },
- "outputs": [],
- "source": [
- "from utils import load_corpus, stopwords\n",
- "\n",
- "TRAIN_PATH = \"./data/weibo2018/train.txt\"\n",
- "TEST_PATH = \"./data/weibo2018/test.txt\""
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "metadata": {
- "cell_id": 2
- },
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Building prefix dict from the default dictionary ...\n",
- "Loading model from cache /tmp/jieba.cache\n",
- "Loading model cost 0.826 seconds.\n",
- "Prefix dict has been built successfully.\n"
- ]
- }
- ],
- "source": [
- "# 分别加载训练集和测试集\n",
- "train_data = load_corpus(TRAIN_PATH)\n",
- "test_data = load_corpus(TEST_PATH)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "metadata": {
- "cell_id": 3
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " text | \n",
- " label | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | 0 | \n",
- " 书中 自有 黄金屋 书中 自有 颜如玉 沿着 岁月 的 长河 跋涉 或是 风光旖旎 或是 姹... | \n",
- " 1 | \n",
- "
\n",
- " \n",
- " | 1 | \n",
- " 这是 英超 被 黑 的 最惨 的 一次 二哈 二哈 十几年来 中国 只有 孙继海 董方卓 郑... | \n",
- " 0 | \n",
- "
\n",
- " \n",
- " | 2 | \n",
- " 中国 远洋 海运 集团 副总经理 俞曾 港 月 日 在 上 表示 中央 企业 走 出去 是 ... | \n",
- " 1 | \n",
- "
\n",
- " \n",
- " | 3 | \n",
- " 看 流星花园 其实 也 还好 啦 现在 的 观念 以及 时尚 眼光 都 不一样 了 或许 十... | \n",
- " 1 | \n",
- "
\n",
- " \n",
- " | 4 | \n",
- " 汉武帝 的 罪己 诏 的 真实性 尽管 存在 着 争议 然而 轮台 罪己 诏 作为 中国 历... | \n",
- " 1 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " text label\n",
- "0 书中 自有 黄金屋 书中 自有 颜如玉 沿着 岁月 的 长河 跋涉 或是 风光旖旎 或是 姹... 1\n",
- "1 这是 英超 被 黑 的 最惨 的 一次 二哈 二哈 十几年来 中国 只有 孙继海 董方卓 郑... 0\n",
- "2 中国 远洋 海运 集团 副总经理 俞曾 港 月 日 在 上 表示 中央 企业 走 出去 是 ... 1\n",
- "3 看 流星花园 其实 也 还好 啦 现在 的 观念 以及 时尚 眼光 都 不一样 了 或许 十... 1\n",
- "4 汉武帝 的 罪己 诏 的 真实性 尽管 存在 着 争议 然而 轮台 罪己 诏 作为 中国 历... 1"
- ]
- },
- "execution_count": 3,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "import pandas as pd\n",
- "\n",
- "df_train = pd.DataFrame(train_data, columns=[\"text\", \"label\"])\n",
- "df_test = pd.DataFrame(test_data, columns=[\"text\", \"label\"])\n",
- "df_train.head()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "cell_id": 42
- },
- "source": [
- "### 训练词向量"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "metadata": {
- "cell_id": 44
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "0 [书中, 自有, 黄金屋, 书中, 自有, 颜如玉, 沿着, 岁月, 的, 长河, 跋涉, ...\n",
- "1 [这是, 英超, 被, 黑, 的, 最惨, 的, 一次, 二哈, 二哈, 十几年来, 中国,...\n",
- "2 [中国, 远洋, 海运, 集团, 副总经理, 俞曾, 港, 月, 日, 在, 上, 表示, ...\n",
- "3 [看, 流星花园, 其实, 也, 还好, 啦, 现在, 的, 观念, 以及, 时尚, 眼光,...\n",
- "4 [汉武帝, 的, 罪己, 诏, 的, 真实性, 尽管, 存在, 着, 争议, 然而, 轮台,...\n",
- "Name: text, dtype: object"
- ]
- },
- "execution_count": 4,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# word2vec要求的输入格式: list(word)\n",
- "wv_input = df_train['text'].map(lambda s: s.split(\" \")) # [for w in s.split(\" \") if w not in stopwords]\n",
- "wv_input.head() "
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 5,
- "metadata": {
- "cell_id": 4
- },
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/home/tiger/.local/lib/python3.7/site-packages/gensim/similarities/__init__.py:15: UserWarning: The gensim.similarities.levenshtein submodule is disabled, because the optional Levenshtein package is unavailable. Install Levenhstein (e.g. `pip install python-Levenshtein`) to suppress this warning.\n",
- " warnings.warn(msg)\n"
- ]
- }
- ],
- "source": [
- "from gensim import models\n",
- "\n",
- "# Word2Vec\n",
- "word2vec = models.Word2Vec(wv_input, \n",
- " vector_size=64, # 词向量维度\n",
- " min_count=1, # 最小词频, 因为数据量较小, 这里卡1\n",
- " epochs=1000) # 迭代轮次"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "cell_id": 46
- },
- "source": [
- "查找近义词, 直观感受训练得到的word2vec效果"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 6,
- "metadata": {
- "cell_id": 5,
- "scrolled": false
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "[('我', 0.9441561102867126),\n",
- " ('自己', 0.8928312659263611),\n",
- " ('他', 0.8796129822731018),\n",
- " ('的', 0.8601957559585571),\n",
- " ('她', 0.855070948600769),\n",
- " ('人', 0.8349815607070923),\n",
- " ('都', 0.8168802261352539),\n",
- " ('了', 0.8017680644989014),\n",
- " ('就', 0.7990766763687134),\n",
- " ('也', 0.7883183360099792)]"
- ]
- },
- "execution_count": 6,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "word2vec.wv.most_similar(\"你\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 7,
- "metadata": {
- "cell_id": 38
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "[('哈哈哈', 0.6309624910354614),\n",
- " ('啦', 0.5457888841629028),\n",
- " ('可爱', 0.5375339984893799),\n",
- " ('了', 0.4885959327220917),\n",
- " ('本柔', 0.46517741680145264),\n",
- " ('笑', 0.4639575779438019),\n",
- " ('哈哈哈哈', 0.45851588249206543),\n",
- " ('心虚', 0.4576280415058136),\n",
- " ('又', 0.45520466566085815),\n",
- " ('呀', 0.4494859576225281)]"
- ]
- },
- "execution_count": 7,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "word2vec.wv.most_similar(\"哈哈\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 8,
- "metadata": {
- "cell_id": 39
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "[('难过', 0.724579393863678),\n",
- " ('哭', 0.6421604752540588),\n",
- " ('想', 0.6415957808494568),\n",
- " ('也', 0.6394745707511902),\n",
- " ('真的', 0.6263709664344788),\n",
- " ('我', 0.6136066317558289),\n",
- " ('都', 0.608888566493988),\n",
- " ('的', 0.6078368425369263),\n",
- " ('就', 0.5916700959205627),\n",
- " ('开心', 0.5899774432182312)]"
- ]
- },
- "execution_count": 8,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "word2vec.wv.most_similar(\"伤心\")"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "cell_id": 48
- },
- "source": [
- "### 神经网络"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 9,
- "metadata": {
- "cell_id": 14
- },
- "outputs": [],
- "source": [
- "import torch\n",
- "from torch import nn\n",
- "from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence,pad_packed_sequence\n",
- "from torch.utils.data import Dataset, DataLoader\n",
- "\n",
- "device = \"cuda:0\" if torch.cuda.is_available() else \"cpu\""
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 10,
- "metadata": {
- "cell_id": 19
- },
- "outputs": [],
- "source": [
- "# 超参数\n",
- "learning_rate = 5e-4\n",
- "input_size = 768\n",
- "num_epoches = 5\n",
- "batch_size = 100\n",
- "embed_size = 64\n",
- "hidden_size = 64\n",
- "num_layers = 2"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 11,
- "metadata": {
- "cell_id": 7
- },
- "outputs": [],
- "source": [
- "# 数据集\n",
- "class MyDataset(Dataset):\n",
- " def __init__(self, df):\n",
- " self.data = []\n",
- " self.label = df[\"label\"].tolist()\n",
- " for s in df[\"text\"].tolist():\n",
- " vectors = []\n",
- " for w in s.split(\" \"):\n",
- " if w in word2vec.wv.key_to_index:\n",
- " vectors.append(word2vec.wv[w]) # 将每个词替换为对应的词向量\n",
- " vectors = torch.Tensor(vectors)\n",
- " self.data.append(vectors)\n",
- " \n",
- " def __getitem__(self, index):\n",
- " data = self.data[index]\n",
- " label = self.label[index]\n",
- " return data, label\n",
- "\n",
- " def __len__(self):\n",
- " return len(self.label)\n",
- "\n",
- "def collate_fn(data):\n",
- " \"\"\"\n",
- " :param data: 第0维:data,第1维:label\n",
- " :return: 序列化的data、记录实际长度的序列、以及label列表\n",
- " \"\"\"\n",
- " data.sort(key=lambda x: len(x[0]), reverse=True) # pack_padded_sequence要求要按照序列的长度倒序排列\n",
- " data_length = [len(sq[0]) for sq in data]\n",
- " x = [i[0] for i in data]\n",
- " y = [i[1] for i in data]\n",
- " data = pad_sequence(x, batch_first=True, padding_value=0) # 用RNN处理变长序列的必要操作\n",
- " return data, torch.tensor(y, dtype=torch.float32), data_length\n",
- "\n",
- "\n",
- "# 训练集\n",
- "train_data = MyDataset(df_train)\n",
- "train_loader = DataLoader(train_data, batch_size=batch_size, collate_fn=collate_fn, shuffle=True)\n",
- "\n",
- "# 测试集\n",
- "test_data = MyDataset(df_test)\n",
- "test_loader = DataLoader(test_data, batch_size=batch_size, collate_fn=collate_fn, shuffle=True)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 16,
- "metadata": {
- "cell_id": 11
- },
- "outputs": [],
- "source": [
- "# 网络结构\n",
- "class LSTM(nn.Module):\n",
- " def __init__(self, input_size, hidden_size, num_layers):\n",
- " super(LSTM, self).__init__()\n",
- " self.hidden_size = hidden_size\n",
- " self.num_layers = num_layers\n",
- " self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, bidirectional=True)\n",
- " self.fc = nn.Linear(hidden_size * 2, 1) # 双向, 输出维度要*2\n",
- " self.sigmoid = nn.Sigmoid()\n",
- "\n",
- " def forward(self, x, lengths):\n",
- " h0 = torch.zeros(self.num_layers * 2, x.size(0), self.hidden_size).to(device) # 双向, 第一个维度要*2\n",
- " c0 = torch.zeros(self.num_layers * 2, x.size(0), self.hidden_size).to(device)\n",
- " \n",
- " packed_input = torch.nn.utils.rnn.pack_padded_sequence(input=x, lengths=lengths, batch_first=True)\n",
- " packed_out, (h_n, h_c) = self.lstm(packed_input, (h0, c0))\n",
- "\n",
- " lstm_out = torch.cat([h_n[-2], h_n[-1]], 1) # 双向, 所以要将最后两维拼接, 得到的就是最后一个time step的输出\n",
- " out = self.fc(lstm_out)\n",
- " out = self.sigmoid(out)\n",
- " return out\n",
- "\n",
- "lstm = LSTM(embed_size, hidden_size, num_layers)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 17,
- "metadata": {
- "cell_id": 26
- },
- "outputs": [],
- "source": [
- "from sklearn import metrics\n",
- "\n",
- "# 在测试集效果检验\n",
- "def test():\n",
- " y_pred, y_true = [], []\n",
- "\n",
- " with torch.no_grad():\n",
- " for x, labels, lengths in test_loader:\n",
- " x = x.to(device)\n",
- " outputs = lstm(x, lengths) # 前向传播\n",
- " outputs = outputs.view(-1) # 将输出展平\n",
- " y_pred.append(outputs)\n",
- " y_true.append(labels)\n",
- "\n",
- " y_prob = torch.cat(y_pred)\n",
- " y_true = torch.cat(y_true)\n",
- " y_pred = y_prob.clone()\n",
- " y_pred[y_pred > 0.5] = 1\n",
- " y_pred[y_pred <= 0.5] = 0\n",
- " \n",
- " print(metrics.classification_report(y_true, y_pred))\n",
- " print(\"准确率:\", metrics.accuracy_score(y_true, y_pred))\n",
- " print(\"AUC:\", metrics.roc_auc_score(y_true, y_prob) )"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 18,
- "metadata": {
- "cell_id": 32
- },
- "outputs": [],
- "source": [
- "# 定义损失函数和优化器\n",
- "criterion = nn.BCELoss()\n",
- "optimizer = torch.optim.Adam(lstm.parameters(), lr=learning_rate)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 19,
- "metadata": {
- "cell_id": 33,
- "scrolled": true
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "epoch:1, step:10, loss:0.689099133014679\n",
- "epoch:1, step:20, loss:0.6717442870140076\n",
- "epoch:1, step:30, loss:0.650161862373352\n",
- "epoch:1, step:40, loss:0.5935518741607666\n",
- "epoch:1, step:50, loss:0.4994719922542572\n",
- "epoch:1, step:60, loss:0.4774974286556244\n",
- "epoch:1, step:70, loss:0.482360303401947\n",
- "epoch:1, step:80, loss:0.44858306646347046\n",
- "epoch:1, step:90, loss:0.4513603746891022\n",
- "epoch:1, step:100, loss:0.4386572241783142\n",
- " precision recall f1-score support\n",
- "\n",
- " 0.0 0.75 0.80 0.78 155\n",
- " 1.0 0.91 0.88 0.89 345\n",
- "\n",
- " accuracy 0.86 500\n",
- " macro avg 0.83 0.84 0.83 500\n",
- "weighted avg 0.86 0.86 0.86 500\n",
- "\n",
- "准确率: 0.856\n",
- "AUC: 0.9141841982234689\n",
- "saved model: ./model/lstm_1.model\n",
- "epoch:2, step:10, loss:0.4317778944969177\n",
- "epoch:2, step:20, loss:0.41387200355529785\n",
- "epoch:2, step:30, loss:0.4237545430660248\n",
- "epoch:2, step:40, loss:0.364933043718338\n",
- "epoch:2, step:50, loss:0.37595903873443604\n",
- "epoch:2, step:60, loss:0.4067295491695404\n",
- "epoch:2, step:70, loss:0.41071224212646484\n",
- "epoch:2, step:80, loss:0.39134103059768677\n",
- "epoch:2, step:90, loss:0.37907883524894714\n",
- "epoch:2, step:100, loss:0.4322803020477295\n",
- " precision recall f1-score support\n",
- "\n",
- " 0.0 0.80 0.63 0.71 155\n",
- " 1.0 0.85 0.93 0.89 345\n",
- "\n",
- " accuracy 0.84 500\n",
- " macro avg 0.83 0.78 0.80 500\n",
- "weighted avg 0.83 0.84 0.83 500\n",
- "\n",
- "准确率: 0.838\n",
- "AUC: 0.9174193548387096\n",
- "saved model: ./model/lstm_2.model\n",
- "epoch:3, step:10, loss:0.37696003913879395\n",
- "epoch:3, step:20, loss:0.36385685205459595\n",
- "epoch:3, step:30, loss:0.3907310664653778\n",
- "epoch:3, step:40, loss:0.35576874017715454\n",
- "epoch:3, step:50, loss:0.36152324080467224\n",
- "epoch:3, step:60, loss:0.3620041608810425\n",
- "epoch:3, step:70, loss:0.32647013664245605\n",
- "epoch:3, step:80, loss:0.38903307914733887\n",
- "epoch:3, step:90, loss:0.34238141775131226\n",
- "epoch:3, step:100, loss:0.3952549397945404\n",
- " precision recall f1-score support\n",
- "\n",
- " 0.0 0.75 0.79 0.77 155\n",
- " 1.0 0.90 0.88 0.89 345\n",
- "\n",
- " accuracy 0.85 500\n",
- " macro avg 0.83 0.84 0.83 500\n",
- "weighted avg 0.86 0.85 0.85 500\n",
- "\n",
- "准确率: 0.854\n",
- "AUC: 0.9280411407199626\n",
- "saved model: ./model/lstm_3.model\n",
- "epoch:4, step:10, loss:0.34902292490005493\n",
- "epoch:4, step:20, loss:0.3277026116847992\n",
- "epoch:4, step:30, loss:0.32119297981262207\n",
- "epoch:4, step:40, loss:0.34501412510871887\n",
- "epoch:4, step:50, loss:0.3202686905860901\n",
- "epoch:4, step:60, loss:0.3599391579627991\n",
- "epoch:4, step:70, loss:0.2958642542362213\n",
- "epoch:4, step:80, loss:0.3152882158756256\n",
- "epoch:4, step:90, loss:0.3151417374610901\n",
- "epoch:4, step:100, loss:0.3314781188964844\n",
- " precision recall f1-score support\n",
- "\n",
- " 0.0 0.78 0.81 0.79 155\n",
- " 1.0 0.91 0.90 0.90 345\n",
- "\n",
- " accuracy 0.87 500\n",
- " macro avg 0.84 0.85 0.85 500\n",
- "weighted avg 0.87 0.87 0.87 500\n",
- "\n",
- "准确率: 0.868\n",
- "AUC: 0.9314258999532491\n",
- "saved model: ./model/lstm_4.model\n",
- "epoch:5, step:10, loss:0.2638005316257477\n",
- "epoch:5, step:20, loss:0.3028942048549652\n",
- "epoch:5, step:30, loss:0.2819410562515259\n",
- "epoch:5, step:40, loss:0.2857419550418854\n",
- "epoch:5, step:50, loss:0.3177730441093445\n",
- "epoch:5, step:60, loss:0.3140687346458435\n",
- "epoch:5, step:70, loss:0.32480892539024353\n",
- "epoch:5, step:80, loss:0.2964351177215576\n",
- "epoch:5, step:90, loss:0.27567631006240845\n",
- "epoch:5, step:100, loss:0.2848973870277405\n",
- " precision recall f1-score support\n",
- "\n",
- " 0.0 0.83 0.74 0.78 155\n",
- " 1.0 0.89 0.93 0.91 345\n",
- "\n",
- " accuracy 0.87 500\n",
- " macro avg 0.86 0.83 0.84 500\n",
- "weighted avg 0.87 0.87 0.87 500\n",
- "\n",
- "准确率: 0.87\n",
- "AUC: 0.9310892940626461\n",
- "saved model: ./model/lstm_5.model\n"
- ]
- }
- ],
- "source": [
- "# 迭代训练\n",
- "for epoch in range(num_epoches):\n",
- " total_loss = 0\n",
- " for i, (x, labels, lengths) in enumerate(train_loader):\n",
- " x = x.to(device)\n",
- " labels = labels.to(device)\n",
- " outputs = lstm(x, lengths) # 前向传播\n",
- " logits = outputs.view(-1) # 将输出展平\n",
- " loss = criterion(logits, labels) # loss计算\n",
- " total_loss += loss\n",
- " optimizer.zero_grad() # 梯度清零\n",
- " loss.backward(retain_graph=True) # 反向传播,计算梯度\n",
- " optimizer.step() # 梯度更新\n",
- " if (i+1) % 10 == 0:\n",
- " print(\"epoch:{}, step:{}, loss:{}\".format(epoch+1, i+1, total_loss/10))\n",
- " total_loss = 0\n",
- " \n",
- " # test\n",
- " test()\n",
- " \n",
- " # save model\n",
- " model_path = \"./model/lstm_{}.model\".format(epoch+1)\n",
- " torch.save(lstm, model_path)\n",
- " print(\"saved model: \", model_path)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "cell_id": 36
- },
- "source": [
- "### 手动输入句子,判断情感倾向(1正/0负)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 20,
- "metadata": {
- "cell_id": 51
- },
- "outputs": [],
- "source": [
- "net = torch.load(\"./model/lstm_5.model\") # 训练过程中的巅峰时刻"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 26,
- "metadata": {
- "cell_id": 52
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "tensor([0.9657, 0.3921])"
- ]
- },
- "execution_count": 26,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "from utils import processing\n",
- "\n",
- "strs = [\"我想说我会爱你多一点点\", \"日有所思梦感伤\"]\n",
- "\n",
- "data = []\n",
- "for s in strs:\n",
- " vectors = []\n",
- " for w in processing(s).split(\" \"):\n",
- " if w in word2vec.wv.key_to_index:\n",
- " vectors.append(word2vec.wv[w]) # 将每个词替换为对应的词向量\n",
- " vectors = torch.Tensor(vectors)\n",
- " data.append(vectors)\n",
- "x, _, lengths = collate_fn(list(zip(data, [-1] * len(strs))))\n",
- "with torch.no_grad():\n",
- " x = x.to(device)\n",
- " outputs = lstm(x, lengths) # 前向传播\n",
- " outputs = outputs.view(-1) # 将输出展平\n",
- "outputs"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "cell_id": 54
- },
- "outputs": [],
- "source": []
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.8.5"
- },
- "max_cell_id": 55
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
diff --git a/WeiboSentiment_MachineLearning/5.bert.ipynb b/WeiboSentiment_MachineLearning/5.bert.ipynb
deleted file mode 100644
index 135cf56..0000000
--- a/WeiboSentiment_MachineLearning/5.bert.ipynb
+++ /dev/null
@@ -1,696 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {
- "cell_id": 39
- },
- "source": [
- "### 加载数据集"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {
- "cell_id": 1
- },
- "outputs": [],
- "source": [
- "from utils import load_corpus_bert\n",
- "\n",
- "TRAIN_PATH = \"./data/weibo2018/train.txt\"\n",
- "TEST_PATH = \"./data/weibo2018/test.txt\""
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "metadata": {
- "cell_id": 3
- },
- "outputs": [],
- "source": [
- "# 分别加载训练集和测试集\n",
- "train_data = load_corpus_bert(TRAIN_PATH)\n",
- "test_data = load_corpus_bert(TEST_PATH)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "metadata": {
- "cell_id": 4
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " text | \n",
- " label | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | 0 | \n",
- " “书中自有黄金屋,书中自有颜如玉”。沿着岁月的长河跋涉,或是风光旖旎,或是姹紫嫣红,万千... | \n",
- " 1 | \n",
- "
\n",
- " \n",
- " | 1 | \n",
- " 这是英超被黑的最惨的一次[二哈][二哈]十几年来,中国只有孙继海,董方卓,郑智,李铁登陆过英... | \n",
- " 0 | \n",
- "
\n",
- " \n",
- " | 2 | \n",
- " 中国远洋海运集团副总经理俞曾港4月21日在 上表示,中央企业“走出去”是要站在更高的平台参... | \n",
- " 1 | \n",
- "
\n",
- " \n",
- " | 3 | \n",
- " 看《流星花园》其实也还好啦,现在的观念以及时尚眼光都不一样了,或许十几年之后的人看我们的现在... | \n",
- " 1 | \n",
- "
\n",
- " \n",
- " | 4 | \n",
- " 汉武帝的罪己诏的真实性尽管存在着争议,然而“轮台罪己诏”作为中国历史上第一份皇帝自我批评的文... | \n",
- " 1 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " text label\n",
- "0 “书中自有黄金屋,书中自有颜如玉”。沿着岁月的长河跋涉,或是风光旖旎,或是姹紫嫣红,万千... 1\n",
- "1 这是英超被黑的最惨的一次[二哈][二哈]十几年来,中国只有孙继海,董方卓,郑智,李铁登陆过英... 0\n",
- "2 中国远洋海运集团副总经理俞曾港4月21日在 上表示,中央企业“走出去”是要站在更高的平台参... 1\n",
- "3 看《流星花园》其实也还好啦,现在的观念以及时尚眼光都不一样了,或许十几年之后的人看我们的现在... 1\n",
- "4 汉武帝的罪己诏的真实性尽管存在着争议,然而“轮台罪己诏”作为中国历史上第一份皇帝自我批评的文... 1"
- ]
- },
- "execution_count": 3,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "import pandas as pd\n",
- "\n",
- "df_train = pd.DataFrame(train_data, columns=[\"text\", \"label\"])\n",
- "df_test = pd.DataFrame(test_data, columns=[\"text\", \"label\"])\n",
- "df_train.head()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "cell_id": 41
- },
- "source": [
- "### 加载Bert"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "metadata": {
- "cell_id": 5
- },
- "outputs": [],
- "source": [
- "import os\n",
- "from transformers import BertTokenizer, BertModel\n",
- "\n",
- "os.environ[\"KMP_DUPLICATE_LIB_OK\"] = \"TRUE\" # 在我的电脑上不加这一句, bert模型会报错\n",
- "MODEL_PATH = \"./model/chinese_wwm_pytorch\" # 下载地址见 https://github.com/ymcui/Chinese-BERT-wwm"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 5,
- "metadata": {
- "cell_id": 6
- },
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Some weights of the model checkpoint at ./model/chinese_wwm_pytorch were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']\n",
- "- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
- "- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n"
- ]
- }
- ],
- "source": [
- "# 加载\n",
- "tokenizer = BertTokenizer.from_pretrained(MODEL_PATH) # 分词器\n",
- "bert = BertModel.from_pretrained(MODEL_PATH) # 模型"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "cell_id": 43
- },
- "source": [
- "### 神经网络"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 6,
- "metadata": {
- "cell_id": 7
- },
- "outputs": [],
- "source": [
- "import torch\n",
- "from torch import nn\n",
- "from torch.utils.data import Dataset, DataLoader\n",
- "\n",
- "device = \"cuda:0\" if torch.cuda.is_available() else \"cpu\""
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 7,
- "metadata": {
- "cell_id": 8
- },
- "outputs": [],
- "source": [
- "# 超参数\n",
- "learning_rate = 1e-3\n",
- "input_size = 768\n",
- "num_epoches = 10\n",
- "batch_size = 100\n",
- "decay_rate = 0.9"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 8,
- "metadata": {
- "cell_id": 9
- },
- "outputs": [],
- "source": [
- "# 数据集\n",
- "class MyDataset(Dataset):\n",
- " def __init__(self, df):\n",
- " self.data = df[\"text\"].tolist()\n",
- " self.label = df[\"label\"].tolist()\n",
- "\n",
- " def __getitem__(self, index):\n",
- " data = self.data[index]\n",
- " label = self.label[index]\n",
- " return data, label\n",
- "\n",
- " def __len__(self):\n",
- " return len(self.label)\n",
- "\n",
- "# 训练集\n",
- "train_data = MyDataset(df_train)\n",
- "train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)\n",
- "\n",
- "# 测试集\n",
- "test_data = MyDataset(df_test)\n",
- "test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=True)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 9,
- "metadata": {
- "cell_id": 10
- },
- "outputs": [],
- "source": [
- "# 网络结构\n",
- "class Net(nn.Module):\n",
- " def __init__(self, input_size):\n",
- " super(Net, self).__init__()\n",
- " self.fc = nn.Linear(input_size, 1)\n",
- " self.sigmoid = nn.Sigmoid()\n",
- "\n",
- " def forward(self, x):\n",
- " out = self.fc(x)\n",
- " out = self.sigmoid(out)\n",
- " return out\n",
- "\n",
- "net = Net(input_size).to(device)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 10,
- "metadata": {
- "cell_id": 34
- },
- "outputs": [],
- "source": [
- "from sklearn import metrics\n",
- "\n",
- "# 测试集效果检验\n",
- "def test():\n",
- " y_pred, y_true = [], []\n",
- "\n",
- " with torch.no_grad():\n",
- " for words, labels in test_loader:\n",
- " tokens = tokenizer(words, padding=True)\n",
- " input_ids = torch.tensor(tokens[\"input_ids\"]).to(device)\n",
- " attention_mask = torch.tensor(tokens[\"attention_mask\"]).to(device)\n",
- " last_hidden_states = bert(input_ids, attention_mask=attention_mask)\n",
- " bert_output = last_hidden_states[0][:, 0]\n",
- " outputs = net(bert_output) # 前向传播\n",
- " outputs = outputs.view(-1) # 将输出展平\n",
- " y_pred.append(outputs)\n",
- " y_true.append(labels)\n",
- "\n",
- " y_prob = torch.cat(y_pred)\n",
- " y_true = torch.cat(y_true)\n",
- " y_pred = y_prob.clone()\n",
- " y_pred[y_pred > 0.5] = 1\n",
- " y_pred[y_pred <= 0.5] = 0\n",
- " \n",
- " print(metrics.classification_report(y_true, y_pred))\n",
- " print(\"准确率:\", metrics.accuracy_score(y_true, y_pred))\n",
- " print(\"AUC:\", metrics.roc_auc_score(y_true, y_prob) )"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 11,
- "metadata": {
- "cell_id": 11
- },
- "outputs": [],
- "source": [
- "# 定义损失函数和优化器\n",
- "criterion = nn.BCELoss()\n",
- "optimizer = torch.optim.Adam(net.parameters(), lr=learning_rate)\n",
- "scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=decay_rate)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 12,
- "metadata": {
- "cell_id": 14,
- "scrolled": true
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "epoch:1, step:10, loss:0.6710587739944458\n",
- "epoch:1, step:20, loss:0.6176288723945618\n",
- "epoch:1, step:30, loss:0.578593909740448\n",
- "epoch:1, step:40, loss:0.5502474308013916\n",
- "epoch:1, step:50, loss:0.5323082804679871\n",
- "epoch:1, step:60, loss:0.515110194683075\n",
- "epoch:1, step:70, loss:0.5127577185630798\n",
- "epoch:1, step:80, loss:0.48992329835891724\n",
- "epoch:1, step:90, loss:0.4868148863315582\n",
- "epoch:1, step:100, loss:0.49194520711898804\n",
- " precision recall f1-score support\n",
- "\n",
- " 0 0.74 0.74 0.74 155\n",
- " 1 0.88 0.88 0.88 345\n",
- "\n",
- " accuracy 0.84 500\n",
- " macro avg 0.81 0.81 0.81 500\n",
- "weighted avg 0.84 0.84 0.84 500\n",
- "\n",
- "准确率: 0.84\n",
- "AUC: 0.9027582982702197\n",
- "saved model: ./model/bert_dnn_1.model\n",
- "epoch:2, step:10, loss:0.46188774704933167\n",
- "epoch:2, step:20, loss:0.4335215985774994\n",
- "epoch:2, step:30, loss:0.4540901184082031\n",
- "epoch:2, step:40, loss:0.4392821788787842\n",
- "epoch:2, step:50, loss:0.47116056084632874\n",
- "epoch:2, step:60, loss:0.4669877886772156\n",
- "epoch:2, step:70, loss:0.4401330053806305\n",
- "epoch:2, step:80, loss:0.4518135190010071\n",
- "epoch:2, step:90, loss:0.4567466676235199\n",
- "epoch:2, step:100, loss:0.4663034975528717\n",
- " precision recall f1-score support\n",
- "\n",
- " 0 0.72 0.83 0.77 155\n",
- " 1 0.92 0.85 0.88 345\n",
- "\n",
- " accuracy 0.85 500\n",
- " macro avg 0.82 0.84 0.83 500\n",
- "weighted avg 0.86 0.85 0.85 500\n",
- "\n",
- "准确率: 0.846\n",
- "AUC: 0.9149322113136981\n",
- "saved model: ./model/bert_dnn_2.model\n",
- "epoch:3, step:10, loss:0.42892661690711975\n",
- "epoch:3, step:20, loss:0.4225884974002838\n",
- "epoch:3, step:30, loss:0.415252685546875\n",
- "epoch:3, step:40, loss:0.43130287528038025\n",
- "epoch:3, step:50, loss:0.42938193678855896\n",
- "epoch:3, step:60, loss:0.4340507388114929\n",
- "epoch:3, step:70, loss:0.4466826319694519\n",
- "epoch:3, step:80, loss:0.45244288444519043\n",
- "epoch:3, step:90, loss:0.41808539628982544\n",
- "epoch:3, step:100, loss:0.44330015778541565\n",
- " precision recall f1-score support\n",
- "\n",
- " 0 0.73 0.83 0.77 155\n",
- " 1 0.92 0.86 0.89 345\n",
- "\n",
- " accuracy 0.85 500\n",
- " macro avg 0.82 0.84 0.83 500\n",
- "weighted avg 0.86 0.85 0.85 500\n",
- "\n",
- "准确率: 0.85\n",
- "AUC: 0.9206545114539505\n",
- "saved model: ./model/bert_dnn_3.model\n",
- "epoch:4, step:10, loss:0.39769938588142395\n",
- "epoch:4, step:20, loss:0.4465697407722473\n",
- "epoch:4, step:30, loss:0.4216257929801941\n",
- "epoch:4, step:40, loss:0.41328248381614685\n",
- "epoch:4, step:50, loss:0.41364049911499023\n",
- "epoch:4, step:60, loss:0.4332212507724762\n",
- "epoch:4, step:70, loss:0.4280005395412445\n",
- "epoch:4, step:80, loss:0.41606149077415466\n",
- "epoch:4, step:90, loss:0.43310579657554626\n",
- "epoch:4, step:100, loss:0.4076871871948242\n",
- " precision recall f1-score support\n",
- "\n",
- " 0 0.76 0.80 0.78 155\n",
- " 1 0.91 0.88 0.90 345\n",
- "\n",
- " accuracy 0.86 500\n",
- " macro avg 0.83 0.84 0.84 500\n",
- "weighted avg 0.86 0.86 0.86 500\n",
- "\n",
- "准确率: 0.858\n",
- "AUC: 0.9222814399251986\n",
- "saved model: ./model/bert_dnn_4.model\n",
- "epoch:5, step:10, loss:0.39923620223999023\n",
- "epoch:5, step:20, loss:0.4110904633998871\n",
- "epoch:5, step:30, loss:0.4446052610874176\n",
- "epoch:5, step:40, loss:0.4050986170768738\n",
- "epoch:5, step:50, loss:0.41362982988357544\n",
- "epoch:5, step:60, loss:0.3961515724658966\n",
- "epoch:5, step:80, loss:0.43208274245262146\n",
- "epoch:5, step:90, loss:0.4123595356941223\n",
- "epoch:5, step:100, loss:0.4114747643470764\n",
- " precision recall f1-score support\n",
- "\n",
- " 0 0.75 0.81 0.78 155\n",
- " 1 0.91 0.88 0.90 345\n",
- "\n",
- " accuracy 0.86 500\n",
- " macro avg 0.83 0.85 0.84 500\n",
- "weighted avg 0.86 0.86 0.86 500\n",
- "\n",
- "准确率: 0.86\n",
- "AUC: 0.9251238896680692\n",
- "saved model: ./model/bert_dnn_5.model\n",
- "epoch:6, step:10, loss:0.4047953188419342\n",
- "epoch:6, step:20, loss:0.41434162855148315\n",
- "epoch:6, step:30, loss:0.4052816927433014\n",
- "epoch:6, step:40, loss:0.3726503849029541\n",
- "epoch:6, step:50, loss:0.4252064824104309\n",
- "epoch:6, step:60, loss:0.411870539188385\n",
- "epoch:6, step:70, loss:0.43613123893737793\n",
- "epoch:6, step:80, loss:0.4038943350315094\n",
- "epoch:6, step:90, loss:0.40738430619239807\n",
- "epoch:6, step:100, loss:0.41697797179222107\n",
- " precision recall f1-score support\n",
- "\n",
- " 0 0.76 0.82 0.79 155\n",
- " 1 0.92 0.88 0.90 345\n",
- "\n",
- " accuracy 0.86 500\n",
- " macro avg 0.84 0.85 0.84 500\n",
- "weighted avg 0.87 0.86 0.87 500\n",
- "\n",
- "准确率: 0.864\n",
- "AUC: 0.9266947171575501\n",
- "saved model: ./model/bert_dnn_6.model\n",
- "epoch:7, step:10, loss:0.4255238175392151\n",
- "epoch:7, step:20, loss:0.3951468765735626\n",
- "epoch:7, step:30, loss:0.41892367601394653\n",
- "epoch:7, step:40, loss:0.40587490797042847\n",
- "epoch:7, step:50, loss:0.3918803036212921\n",
- "epoch:7, step:60, loss:0.43665409088134766\n",
- "epoch:7, step:70, loss:0.4085603654384613\n",
- "epoch:7, step:80, loss:0.3877314627170563\n",
- "epoch:7, step:90, loss:0.3680875301361084\n",
- "epoch:7, step:100, loss:0.4211949408054352\n",
- " precision recall f1-score support\n",
- "\n",
- " 0 0.74 0.85 0.79 155\n",
- " 1 0.93 0.87 0.90 345\n",
- "\n",
- " accuracy 0.86 500\n",
- " macro avg 0.83 0.86 0.84 500\n",
- "weighted avg 0.87 0.86 0.86 500\n",
- "\n",
- "准确率: 0.86\n",
- "AUC: 0.9282094436652641\n",
- "saved model: ./model/bert_dnn_7.model\n",
- "epoch:8, step:10, loss:0.3657851815223694\n",
- "epoch:8, step:20, loss:0.3944622576236725\n",
- "epoch:8, step:30, loss:0.40657711029052734\n",
- "epoch:8, step:40, loss:0.3935934901237488\n",
- "epoch:8, step:50, loss:0.4171984791755676\n",
- "epoch:8, step:60, loss:0.4169773459434509\n",
- "epoch:8, step:70, loss:0.4021885395050049\n",
- "epoch:8, step:80, loss:0.4106557369232178\n",
- "epoch:8, step:100, loss:0.4116268754005432\n",
- " precision recall f1-score support\n",
- "\n",
- " 0 0.80 0.78 0.79 155\n",
- " 1 0.90 0.91 0.91 345\n",
- "\n",
- " accuracy 0.87 500\n",
- " macro avg 0.85 0.85 0.85 500\n",
- "weighted avg 0.87 0.87 0.87 500\n",
- "\n",
- "准确率: 0.87\n",
- "AUC: 0.9288078541374474\n",
- "saved model: ./model/bert_dnn_8.model\n",
- "epoch:9, step:10, loss:0.4415532052516937\n",
- "epoch:9, step:20, loss:0.4093624949455261\n",
- "epoch:9, step:30, loss:0.3825526833534241\n",
- "epoch:9, step:40, loss:0.3692132532596588\n",
- "epoch:9, step:50, loss:0.39409342408180237\n",
- "epoch:9, step:60, loss:0.40440621972084045\n",
- "epoch:9, step:70, loss:0.3859332203865051\n",
- "epoch:9, step:80, loss:0.40987101197242737\n",
- "epoch:9, step:90, loss:0.4061252176761627\n",
- "epoch:9, step:100, loss:0.4131951332092285\n",
- " precision recall f1-score support\n",
- "\n",
- " 0 0.75 0.84 0.79 155\n",
- " 1 0.92 0.88 0.90 345\n",
- "\n",
- " accuracy 0.86 500\n",
- " macro avg 0.84 0.86 0.85 500\n",
- "weighted avg 0.87 0.86 0.87 500\n",
- "\n",
- "准确率: 0.864\n",
- "AUC: 0.9293501636278635\n",
- "saved model: ./model/bert_dnn_9.model\n",
- "epoch:10, step:10, loss:0.40611615777015686\n",
- "epoch:10, step:20, loss:0.42403316497802734\n",
- "epoch:10, step:30, loss:0.3972412943840027\n",
- "epoch:10, step:40, loss:0.4144269526004791\n",
- "epoch:10, step:50, loss:0.37967294454574585\n",
- "epoch:10, step:60, loss:0.3992181420326233\n",
- "epoch:10, step:70, loss:0.3896545469760895\n",
- "epoch:10, step:80, loss:0.39779797196388245\n",
- "epoch:10, step:90, loss:0.38316115736961365\n",
- "epoch:10, step:100, loss:0.4042983055114746\n",
- " precision recall f1-score support\n",
- "\n",
- " 0 0.76 0.82 0.79 155\n",
- " 1 0.92 0.88 0.90 345\n",
- "\n",
- " accuracy 0.86 500\n",
- " macro avg 0.84 0.85 0.84 500\n",
- "weighted avg 0.87 0.86 0.86 500\n",
- "\n",
- "准确率: 0.862\n",
- "AUC: 0.9303973819541842\n",
- "saved model: ./model/bert_dnn_10.model\n"
- ]
- }
- ],
- "source": [
- "# 迭代训练\n",
- "for epoch in range(num_epoches):\n",
- " total_loss = 0\n",
- " for i, (words, labels) in enumerate(train_loader):\n",
- " tokens = tokenizer(words, padding=True)\n",
- " input_ids = torch.tensor(tokens[\"input_ids\"]).to(device)\n",
- " attention_mask = torch.tensor(tokens[\"attention_mask\"]).to(device)\n",
- " labels = labels.float().to(device)\n",
- " with torch.no_grad():\n",
- " last_hidden_states = bert(input_ids, attention_mask=attention_mask)\n",
- " bert_output = last_hidden_states[0][:, 0]\n",
- " optimizer.zero_grad() # 梯度清零\n",
- " outputs = net(bert_output) # 前向传播\n",
- " logits = outputs.view(-1) # 将输出展平\n",
- " loss = criterion(logits, labels) # loss计算\n",
- " total_loss += loss\n",
- " loss.backward() # 反向传播,计算梯度\n",
- " optimizer.step() # 梯度更新\n",
- " if (i+1) % 10 == 0:\n",
- " print(\"epoch:{}, step:{}, loss:{}\".format(epoch+1, i+1, total_loss/10))\n",
- " total_loss = 0\n",
- " \n",
- " # learning_rate decay\n",
- " scheduler.step()\n",
- " \n",
- " # test\n",
- " test()\n",
- " \n",
- " # save model\n",
- " model_path = \"./model/bert_dnn_{}.model\".format(epoch+1)\n",
- " torch.save(net, model_path)\n",
- " print(\"saved model: \", model_path)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "cell_id": 23
- },
- "source": [
- "### 手动输入句子,判断情感倾向(1正/0负)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 14,
- "metadata": {
- "cell_id": 38
- },
- "outputs": [],
- "source": [
- "net = torch.load(\"./model/bert_dnn_8.model\") # 训练过程中的巅峰时刻"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 15,
- "metadata": {
- "cell_id": 37
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "tensor([[0.9007],\n",
- " [0.2211]], grad_fn=)"
- ]
- },
- "execution_count": 15,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "s = [\"华丽繁荣的城市、充满回忆的小镇、郁郁葱葱的山谷...\", \"突然就觉得人间不值得\"]\n",
- "tokens = tokenizer(s, padding=True)\n",
- "input_ids = torch.tensor(tokens[\"input_ids\"])\n",
- "attention_mask = torch.tensor(tokens[\"attention_mask\"])\n",
- "last_hidden_states = bert(input_ids, attention_mask=attention_mask)\n",
- "bert_output = last_hidden_states[0][:, 0]\n",
- "outputs = net(bert_output)\n",
- "outputs"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 21,
- "metadata": {
- "cell_id": 27,
- "scrolled": true
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "tensor([[0.9735],\n",
- " [0.9882]], grad_fn=)"
- ]
- },
- "execution_count": 21,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "s = [\"今天天气真好\", \"今天天气特别特别棒\"]\n",
- "tokens = tokenizer(s, padding=True)\n",
- "input_ids = torch.tensor(tokens[\"input_ids\"])\n",
- "attention_mask = torch.tensor(tokens[\"attention_mask\"])\n",
- "last_hidden_states = bert(input_ids, attention_mask=attention_mask)\n",
- "bert_output = last_hidden_states[0][:, 0]\n",
- "outputs = net(bert_output)\n",
- "outputs"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "cell_id": 32
- },
- "outputs": [],
- "source": []
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.8.5"
- },
- "max_cell_id": 45
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
diff --git a/WeiboSentiment_MachineLearning/README.md b/WeiboSentiment_MachineLearning/README.md
index 4f1790a..bece91f 100644
--- a/WeiboSentiment_MachineLearning/README.md
+++ b/WeiboSentiment_MachineLearning/README.md
@@ -1,32 +1,107 @@
-# WeiboSentiment
-用各种机器学习对中文微博进行情感分析
-语料来源: https://github.com/dengxiuqi/weibo2018
----
-##### "微博情感分析"是我本科的毕业设计, 也是我入门NLP的项目, 就把它发出来供大家交流。
-##### 2021.06.07更新: 之前的版本写得比较随意, 没想到star破百了, 私下也有一些刚入门NLP的同学因为这个项目联系我, 就更新一下这个项目吧
-* 重构项目架构和代码, 提高可读性
-* 每个文件中的特征、数据处理方法与模型细节都尽可能避免重复, 以给各位同学提供更多的参考
-* 神经网络结构换成了pytorch, 需要`tensorflow 1.0`代码的同学请回退至`445998`版本。
-* 新增了`Bert`模型
-* 由于gensim新老版本很多语法不兼容, 将gensim更新为4.0版本
-----
-#### 项目说明
-* 训练集10000条语料, 测试集500条语料
-* 使用朴素贝叶斯、SVM、XGBoost、LSTM和Bert, 等多种模型搭建并训练二分类模型
-* 前3个模型都采用端到端的训练方法
-* LSTM先预训练得到Word2Vec词向量, 在训练神经网络
-* `Bert`使用的是哈工大的预训练模型, 用Bert的`[CLS]`位输出在一个下游网络上进行finetune。预训练模型需要自行下载:
- * github下载地址: https://github.com/ymcui/Chinese-BERT-wwm
- * baidu网盘: https://pan.baidu.com/s/16z-ybrqT6wLdy_mLHtywSw 密码: djkj
- * 下载后将文件夹放在`./model`文件夹下, 并将`bert_config.json`改名为`config.json`
----
-#### 实验结果
-各种分类器在测试集上的测试结果
+# 微博情感分析 - 传统机器学习方法
-|模型|准确率|AUC|
-| :---: | :---: | :---: |
-|1.bayes|0.856| - |
-|2.svm|0.856| - |
-|3.xgboost|0.86| 0.904 |
-|4.lstm|0.87| 0.931 |
-|5.bert|0.87| 0.929 |
+## 项目介绍
+
+本项目使用5种传统机器学习方法对中文微博进行情感二分类(正面/负面):
+
+- **朴素贝叶斯**: 基于词袋模型的概率分类
+- **SVM**: 基于TF-IDF特征的支持向量机
+- **XGBoost**: 梯度提升决策树
+- **LSTM**: 循环神经网络 + Word2Vec词向量
+- **BERT+分类头**: 预训练语言模型接分类器(我认为也属于传统ML范畴)
+
+## 模型性能
+
+在微博情感数据集上的表现(训练集10000条,测试集500条):
+
+| 模型 | 准确率 | AUC | 特点 |
+|------|--------|-----|------|
+| 朴素贝叶斯 | 85.6% | - | 速度快,内存占用小 |
+| SVM | 85.6% | - | 泛化能力好 |
+| XGBoost | 86.0% | 90.4% | 性能稳定,支持特征重要性 |
+| LSTM | 87.0% | 93.1% | 理解序列信息和上下文 |
+| BERT+分类头 | 87.0% | 92.9% | 强大的语义理解能力 |
+
+## 环境配置
+
+```bash
+pip install -r requirements.txt
+```
+
+数据文件结构:
+```
+data/
+├── weibo2018/
+│ ├── train.txt
+│ └── test.txt
+└── stopwords.txt
+```
+
+## 训练模型(后面可以不接参数直接运行)
+
+### 朴素贝叶斯
+```bash
+python bayes_train.py
+```
+
+### SVM
+```bash
+python svm_train.py --kernel rbf --C 1.0
+```
+
+### XGBoost
+```bash
+python xgboost_train.py --max_depth 6 --eta 0.3 --num_boost_round 200
+```
+
+### LSTM
+```bash
+python lstm_train.py --epochs 5 --batch_size 100 --hidden_size 64
+```
+
+### BERT
+```bash
+python bert_train.py --epochs 10 --batch_size 100 --learning_rate 1e-3
+```
+
+注:BERT模型会自动下载中文预训练模型(bert-base-chinese)
+
+## 使用预测
+
+### 交互式预测(推荐)
+```bash
+python predict.py
+```
+
+### 命令行预测
+```bash
+# 单模型预测
+python predict.py --model_type bert --text "今天天气真好,心情很棒"
+
+# 多模型集成预测
+python predict.py --ensemble --text "这部电影太无聊了"
+```
+
+## 文件结构
+
+```
+WeiboSentiment_MachineLearning/
+├── bayes_train.py # 朴素贝叶斯训练
+├── svm_train.py # SVM训练
+├── xgboost_train.py # XGBoost训练
+├── lstm_train.py # LSTM训练
+├── bert_train.py # BERT训练
+├── predict.py # 统一预测程序
+├── base_model.py # 基础模型类
+├── utils.py # 工具函数
+├── requirements.txt # 依赖包
+├── model/ # 模型保存目录
+└── data/ # 数据目录
+```
+
+## 注意事项
+
+1. **BERT模型**首次运行会自动下载预训练模型(约400MB)
+2. **LSTM模型**训练时间较长,建议使用GPU
+3. **模型保存**在 `model/` 目录下,确保有足够磁盘空间
+4. **内存需求**BERT > LSTM > XGBoost > SVM > 朴素贝叶斯
diff --git a/WeiboSentiment_MachineLearning/base_model.py b/WeiboSentiment_MachineLearning/base_model.py
new file mode 100644
index 0000000..09da6cf
--- /dev/null
+++ b/WeiboSentiment_MachineLearning/base_model.py
@@ -0,0 +1,120 @@
+# -*- coding: utf-8 -*-
+"""
+基础模型类,为所有情感分析模型提供统一接口
+"""
+import os
+import pickle
+from abc import ABC, abstractmethod
+from typing import List, Tuple, Dict, Any
+import pandas as pd
+from sklearn.metrics import accuracy_score, f1_score, classification_report
+from utils import load_corpus
+
+
+class BaseModel(ABC):
+ """情感分析模型基类"""
+
+ def __init__(self, model_name: str):
+ self.model_name = model_name
+ self.model = None
+ self.vectorizer = None
+ self.is_trained = False
+
+ @abstractmethod
+ def train(self, train_data: List[Tuple[str, int]], **kwargs) -> None:
+ """训练模型"""
+ pass
+
+ @abstractmethod
+ def predict(self, texts: List[str]) -> List[int]:
+ """预测文本情感"""
+ pass
+
+ def predict_single(self, text: str) -> Tuple[int, float]:
+ """预测单条文本的情感
+
+ Args:
+ text: 待预测文本
+
+ Returns:
+ (predicted_label, confidence)
+ """
+ predictions = self.predict([text])
+ return predictions[0], 0.0 # 默认置信度为0
+
+ def evaluate(self, test_data: List[Tuple[str, int]]) -> Dict[str, float]:
+ """评估模型性能"""
+ if not self.is_trained:
+ raise ValueError(f"模型 {self.model_name} 尚未训练,请先调用train方法")
+
+ texts = [item[0] for item in test_data]
+ labels = [item[1] for item in test_data]
+
+ predictions = self.predict(texts)
+
+ accuracy = accuracy_score(labels, predictions)
+ f1 = f1_score(labels, predictions, average='weighted')
+
+ print(f"\n{self.model_name} 模型评估结果:")
+ print(f"准确率: {accuracy:.4f}")
+ print(f"F1分数: {f1:.4f}")
+ print("\n详细报告:")
+ print(classification_report(labels, predictions))
+
+ return {
+ 'accuracy': accuracy,
+ 'f1_score': f1,
+ 'classification_report': classification_report(labels, predictions)
+ }
+
+ def save_model(self, model_path: str = None) -> None:
+ """保存模型到文件"""
+ if not self.is_trained:
+ raise ValueError(f"模型 {self.model_name} 尚未训练,无法保存")
+
+ if model_path is None:
+ model_path = f"model/{self.model_name}_model.pkl"
+
+ # 创建保存目录
+ os.makedirs(os.path.dirname(model_path), exist_ok=True)
+
+ # 保存模型数据
+ model_data = {
+ 'model': self.model,
+ 'vectorizer': self.vectorizer,
+ 'model_name': self.model_name,
+ 'is_trained': self.is_trained
+ }
+
+ with open(model_path, 'wb') as f:
+ pickle.dump(model_data, f)
+
+ print(f"模型已保存到: {model_path}")
+
+ def load_model(self, model_path: str) -> None:
+ """从文件加载模型"""
+ if not os.path.exists(model_path):
+ raise FileNotFoundError(f"模型文件不存在: {model_path}")
+
+ with open(model_path, 'rb') as f:
+ model_data = pickle.load(f)
+
+ self.model = model_data['model']
+ self.vectorizer = model_data.get('vectorizer')
+ self.model_name = model_data['model_name']
+ self.is_trained = model_data['is_trained']
+
+ print(f"已加载模型: {model_path}")
+
+ @staticmethod
+ def load_data(train_path: str, test_path: str) -> Tuple[List[Tuple[str, int]], List[Tuple[str, int]]]:
+ """加载训练和测试数据"""
+ print("加载训练数据...")
+ train_data = load_corpus(train_path)
+ print(f"训练数据量: {len(train_data)}")
+
+ print("加载测试数据...")
+ test_data = load_corpus(test_path)
+ print(f"测试数据量: {len(test_data)}")
+
+ return train_data, test_data
\ No newline at end of file
diff --git a/WeiboSentiment_MachineLearning/bayes_train.py b/WeiboSentiment_MachineLearning/bayes_train.py
new file mode 100644
index 0000000..7ef3836
--- /dev/null
+++ b/WeiboSentiment_MachineLearning/bayes_train.py
@@ -0,0 +1,155 @@
+# -*- coding: utf-8 -*-
+"""
+朴素贝叶斯情感分析模型训练脚本
+"""
+import argparse
+import pandas as pd
+from typing import List, Tuple
+from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.naive_bayes import MultinomialNB
+from sklearn.metrics import accuracy_score, f1_score
+
+from base_model import BaseModel
+from utils import stopwords
+
+
+class BayesModel(BaseModel):
+ """朴素贝叶斯情感分析模型"""
+
+ def __init__(self):
+ super().__init__("Bayes")
+
+ def train(self, train_data: List[Tuple[str, int]], **kwargs) -> None:
+ """训练朴素贝叶斯模型
+
+ Args:
+ train_data: 训练数据,格式为[(text, label), ...]
+ **kwargs: 其他参数
+ """
+ print(f"开始训练 {self.model_name} 模型...")
+
+ # 准备数据
+ df_train = pd.DataFrame(train_data, columns=["words", "label"])
+
+ # 特征编码(词袋模型)
+ print("构建词袋模型...")
+ self.vectorizer = CountVectorizer(
+ token_pattern=r'\[?\w+\]?',
+ stop_words=stopwords
+ )
+
+ X_train = self.vectorizer.fit_transform(df_train["words"])
+ y_train = df_train["label"]
+
+ print(f"特征维度: {X_train.shape[1]}")
+
+ # 训练模型
+ print("训练朴素贝叶斯分类器...")
+ self.model = MultinomialNB()
+ self.model.fit(X_train, y_train)
+
+ self.is_trained = True
+ print(f"{self.model_name} 模型训练完成!")
+
+ def predict(self, texts: List[str]) -> List[int]:
+ """预测文本情感
+
+ Args:
+ texts: 待预测文本列表
+
+ Returns:
+ 预测结果列表
+ """
+ if not self.is_trained:
+ raise ValueError(f"模型 {self.model_name} 尚未训练,请先调用train方法")
+
+ # 特征转换
+ X = self.vectorizer.transform(texts)
+
+ # 预测
+ predictions = self.model.predict(X)
+
+ return predictions.tolist()
+
+ def predict_single(self, text: str) -> Tuple[int, float]:
+ """预测单条文本的情感
+
+ Args:
+ text: 待预测文本
+
+ Returns:
+ (predicted_label, confidence)
+ """
+ if not self.is_trained:
+ raise ValueError(f"模型 {self.model_name} 尚未训练,请先调用train方法")
+
+ # 特征转换
+ X = self.vectorizer.transform([text])
+
+ # 预测
+ prediction = self.model.predict(X)[0]
+ probabilities = self.model.predict_proba(X)[0]
+ confidence = max(probabilities)
+
+ return int(prediction), float(confidence)
+
+
+def main():
+ """主函数"""
+ parser = argparse.ArgumentParser(description='朴素贝叶斯情感分析模型训练')
+ parser.add_argument('--train_path', type=str, default='./data/weibo2018/train.txt',
+ help='训练数据路径')
+ parser.add_argument('--test_path', type=str, default='./data/weibo2018/test.txt',
+ help='测试数据路径')
+ parser.add_argument('--model_path', type=str, default='./model/bayes_model.pkl',
+ help='模型保存路径')
+ parser.add_argument('--eval_only', action='store_true',
+ help='仅评估已有模型,不进行训练')
+
+ args = parser.parse_args()
+
+ # 创建模型
+ model = BayesModel()
+
+ if args.eval_only:
+ # 仅评估模式
+ print("评估模式:加载已有模型进行评估")
+ model.load_model(args.model_path)
+
+ # 加载测试数据
+ _, test_data = BaseModel.load_data(args.train_path, args.test_path)
+
+ # 评估模型
+ model.evaluate(test_data)
+ else:
+ # 训练模式
+ # 加载数据
+ train_data, test_data = BaseModel.load_data(args.train_path, args.test_path)
+
+ # 训练模型
+ model.train(train_data)
+
+ # 评估模型
+ model.evaluate(test_data)
+
+ # 保存模型
+ model.save_model(args.model_path)
+
+ # 示例预测
+ print("\n示例预测:")
+ test_texts = [
+ "今天天气真好,心情很棒",
+ "这部电影太无聊了,浪费时间",
+ "哈哈哈,太有趣了"
+ ]
+
+ for text in test_texts:
+ pred, conf = model.predict_single(text)
+ sentiment = "正面" if pred == 1 else "负面"
+ print(f"文本: {text}")
+ print(f"预测: {sentiment} (置信度: {conf:.4f})")
+ print()
+
+
+if __name__ == "__main__":
+ main()
\ No newline at end of file
diff --git a/WeiboSentiment_MachineLearning/bert_train.py b/WeiboSentiment_MachineLearning/bert_train.py
new file mode 100644
index 0000000..2f94b32
--- /dev/null
+++ b/WeiboSentiment_MachineLearning/bert_train.py
@@ -0,0 +1,413 @@
+# -*- coding: utf-8 -*-
+"""
+BERT情感分析模型训练脚本
+"""
+import argparse
+import os
+import pandas as pd
+import torch
+import torch.nn as nn
+from torch.utils.data import Dataset, DataLoader
+from transformers import BertTokenizer, BertModel
+from sklearn.metrics import accuracy_score, f1_score, classification_report, roc_auc_score
+from typing import List, Tuple
+import warnings
+import requests
+from pathlib import Path
+
+from base_model import BaseModel
+from utils import load_corpus_bert
+
+# 忽略transformers的警告
+warnings.filterwarnings("ignore")
+os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
+
+
+class BertDataset(Dataset):
+ """BERT数据集"""
+
+ def __init__(self, data: List[Tuple[str, int]]):
+ self.data = [item[0] for item in data]
+ self.labels = [item[1] for item in data]
+
+ def __getitem__(self, index):
+ return self.data[index], self.labels[index]
+
+ def __len__(self):
+ return len(self.labels)
+
+
+class BertClassifier(nn.Module):
+ """BERT分类器网络"""
+
+ def __init__(self, input_size):
+ super(BertClassifier, self).__init__()
+ self.fc = nn.Linear(input_size, 1)
+ self.sigmoid = nn.Sigmoid()
+
+ def forward(self, x):
+ out = self.fc(x)
+ out = self.sigmoid(out)
+ return out
+
+
+class BertModel_Custom(BaseModel):
+ """BERT情感分析模型"""
+
+ def __init__(self, model_path: str = "./model/chinese_wwm_pytorch"):
+ super().__init__("BERT")
+ self.model_path = model_path
+ self.tokenizer = None
+ self.bert = None
+ self.classifier = None
+ self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+
+ def _download_bert_model(self):
+ """自动下载BERT预训练模型"""
+ print(f"BERT模型不存在,正在下载中文BERT预训练模型...")
+ print("下载来源: bert-base-chinese (Hugging Face)")
+
+ try:
+ # 创建模型目录
+ os.makedirs(self.model_path, exist_ok=True)
+
+ # 使用Hugging Face的中文BERT模型
+ model_name = "bert-base-chinese"
+ print(f"正在从Hugging Face下载 {model_name}...")
+
+ # 下载tokenizer
+ print("下载分词器...")
+ tokenizer = BertTokenizer.from_pretrained(model_name)
+ tokenizer.save_pretrained(self.model_path)
+
+ # 下载模型
+ print("下载BERT模型...")
+ bert_model = BertModel.from_pretrained(model_name)
+ bert_model.save_pretrained(self.model_path)
+
+ print(f"✅ BERT模型下载完成,保存在: {self.model_path}")
+ return True
+
+ except Exception as e:
+ print(f"❌ BERT模型下载失败: {e}")
+ print("\n💡 您可以手动下载BERT模型:")
+ print("1. 访问 https://huggingface.co/bert-base-chinese")
+ print("2. 或使用哈工大中文BERT: https://github.com/ymcui/Chinese-BERT-wwm")
+ print(f"3. 将模型文件解压到: {self.model_path}")
+ return False
+
+ def _load_bert(self):
+ """加载BERT模型和分词器"""
+ print(f"加载BERT模型: {self.model_path}")
+
+ # 如果模型不存在,尝试自动下载
+ if not os.path.exists(self.model_path) or not any(os.scandir(self.model_path)):
+ print("BERT模型不存在,尝试自动下载...")
+ if not self._download_bert_model():
+ raise FileNotFoundError(f"BERT模型下载失败,请手动下载到: {self.model_path}")
+
+ try:
+ self.tokenizer = BertTokenizer.from_pretrained(self.model_path)
+ self.bert = BertModel.from_pretrained(self.model_path).to(self.device)
+
+ # 冻结BERT参数
+ for param in self.bert.parameters():
+ param.requires_grad = False
+
+ print("✅ BERT模型加载完成")
+
+ except Exception as e:
+ print(f"❌ BERT模型加载失败: {e}")
+ print("尝试使用在线模型...")
+
+ # 如果本地加载失败,尝试直接使用在线模型
+ try:
+ model_name = "bert-base-chinese"
+ self.tokenizer = BertTokenizer.from_pretrained(model_name)
+ self.bert = BertModel.from_pretrained(model_name).to(self.device)
+
+ # 冻结BERT参数
+ for param in self.bert.parameters():
+ param.requires_grad = False
+
+ print("✅ 在线BERT模型加载完成")
+
+ except Exception as e2:
+ print(f"❌ 在线模型也加载失败: {e2}")
+ raise FileNotFoundError(f"无法加载BERT模型,请检查网络连接或手动下载模型到: {self.model_path}")
+
+ def train(self, train_data: List[Tuple[str, int]], **kwargs) -> None:
+ """训练BERT模型"""
+ print(f"开始训练 {self.model_name} 模型...")
+
+ # 加载BERT
+ self._load_bert()
+
+ # 超参数
+ learning_rate = kwargs.get('learning_rate', 1e-3)
+ num_epochs = kwargs.get('num_epochs', 10)
+ batch_size = kwargs.get('batch_size', 100)
+ input_size = kwargs.get('input_size', 768)
+ decay_rate = kwargs.get('decay_rate', 0.9)
+
+ print(f"BERT超参数: lr={learning_rate}, epochs={num_epochs}, "
+ f"batch_size={batch_size}, input_size={input_size}")
+
+ # 创建数据集
+ train_dataset = BertDataset(train_data)
+ train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
+
+ # 创建分类器
+ self.classifier = BertClassifier(input_size).to(self.device)
+
+ # 损失函数和优化器
+ criterion = nn.BCELoss()
+ optimizer = torch.optim.Adam(self.classifier.parameters(), lr=learning_rate)
+ scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=decay_rate)
+
+ # 训练循环
+ self.bert.eval() # BERT始终保持评估模式
+ self.classifier.train()
+
+ for epoch in range(num_epochs):
+ total_loss = 0
+ num_batches = 0
+
+ for i, (words, labels) in enumerate(train_loader):
+ # 分词和编码
+ tokens = self.tokenizer(words, padding=True, truncation=True,
+ max_length=512, return_tensors='pt')
+ input_ids = tokens["input_ids"].to(self.device)
+ attention_mask = tokens["attention_mask"].to(self.device)
+ labels = torch.tensor(labels, dtype=torch.float32).to(self.device)
+
+ # 获取BERT输出(冻结参数)
+ with torch.no_grad():
+ bert_outputs = self.bert(input_ids, attention_mask=attention_mask)
+ bert_output = bert_outputs[0][:, 0] # [CLS] token的输出
+
+ # 分类器前向传播
+ optimizer.zero_grad()
+ outputs = self.classifier(bert_output)
+ logits = outputs.view(-1)
+ loss = criterion(logits, labels)
+
+ # 反向传播
+ loss.backward()
+ optimizer.step()
+
+ total_loss += loss.item()
+ num_batches += 1
+
+ if (i + 1) % 10 == 0:
+ avg_loss = total_loss / num_batches
+ print(f"Epoch [{epoch+1}/{num_epochs}], Step [{i+1}], Loss: {avg_loss:.4f}")
+ total_loss = 0
+ num_batches = 0
+
+ # 学习率衰减
+ scheduler.step()
+
+ # 保存每个epoch的模型
+ if kwargs.get('save_each_epoch', False):
+ epoch_model_path = f"./model/bert_epoch_{epoch+1}.pth"
+ os.makedirs(os.path.dirname(epoch_model_path), exist_ok=True)
+ torch.save(self.classifier.state_dict(), epoch_model_path)
+ print(f"已保存模型: {epoch_model_path}")
+
+ self.is_trained = True
+ print(f"{self.model_name} 模型训练完成!")
+
+ def predict(self, texts: List[str]) -> List[int]:
+ """预测文本情感"""
+ if not self.is_trained:
+ raise ValueError(f"模型 {self.model_name} 尚未训练,请先调用train方法")
+
+ predictions = []
+ batch_size = 32
+
+ self.bert.eval()
+ self.classifier.eval()
+
+ with torch.no_grad():
+ for i in range(0, len(texts), batch_size):
+ batch_texts = texts[i:i+batch_size]
+
+ # 分词和编码
+ tokens = self.tokenizer(batch_texts, padding=True, truncation=True,
+ max_length=512, return_tensors='pt')
+ input_ids = tokens["input_ids"].to(self.device)
+ attention_mask = tokens["attention_mask"].to(self.device)
+
+ # 获取BERT输出
+ bert_outputs = self.bert(input_ids, attention_mask=attention_mask)
+ bert_output = bert_outputs[0][:, 0]
+
+ # 分类器预测
+ outputs = self.classifier(bert_output)
+ outputs = outputs.view(-1)
+
+ # 转换为类别标签
+ preds = (outputs > 0.5).cpu().numpy()
+ predictions.extend(preds.astype(int).tolist())
+
+ return predictions
+
+ def predict_single(self, text: str) -> Tuple[int, float]:
+ """预测单条文本的情感"""
+ if not self.is_trained:
+ raise ValueError(f"模型 {self.model_name} 尚未训练,请先调用train方法")
+
+ self.bert.eval()
+ self.classifier.eval()
+
+ with torch.no_grad():
+ # 分词和编码
+ tokens = self.tokenizer([text], padding=True, truncation=True,
+ max_length=512, return_tensors='pt')
+ input_ids = tokens["input_ids"].to(self.device)
+ attention_mask = tokens["attention_mask"].to(self.device)
+
+ # 获取BERT输出
+ bert_outputs = self.bert(input_ids, attention_mask=attention_mask)
+ bert_output = bert_outputs[0][:, 0]
+
+ # 分类器预测
+ output = self.classifier(bert_output)
+ prob = output.item()
+
+ prediction = int(prob > 0.5)
+ confidence = prob if prediction == 1 else 1 - prob
+
+ return prediction, confidence
+
+ def save_model(self, model_path: str = None) -> None:
+ """保存模型"""
+ if not self.is_trained:
+ raise ValueError(f"模型 {self.model_name} 尚未训练,无法保存")
+
+ if model_path is None:
+ model_path = f"./model/{self.model_name.lower()}_model.pth"
+
+ os.makedirs(os.path.dirname(model_path), exist_ok=True)
+
+ # 保存分类器和相关信息
+ model_data = {
+ 'classifier_state_dict': self.classifier.state_dict(),
+ 'model_path': self.model_path,
+ 'input_size': 768,
+ 'device': str(self.device)
+ }
+
+ torch.save(model_data, model_path)
+ print(f"模型已保存到: {model_path}")
+
+ def load_model(self, model_path: str) -> None:
+ """加载模型"""
+ if not os.path.exists(model_path):
+ raise FileNotFoundError(f"模型文件不存在: {model_path}")
+
+ model_data = torch.load(model_path, map_location=self.device)
+
+ # 设置BERT模型路径
+ self.model_path = model_data['model_path']
+
+ # 加载BERT
+ self._load_bert()
+
+ # 重建分类器
+ input_size = model_data['input_size']
+ self.classifier = BertClassifier(input_size).to(self.device)
+
+ # 加载分类器权重
+ self.classifier.load_state_dict(model_data['classifier_state_dict'])
+
+ self.is_trained = True
+ print(f"已加载模型: {model_path}")
+
+ @staticmethod
+ def load_data(train_path: str, test_path: str) -> Tuple[List[Tuple[str, int]], List[Tuple[str, int]]]:
+ """加载BERT格式的数据"""
+ print("加载训练数据...")
+ train_data = load_corpus_bert(train_path)
+ print(f"训练数据量: {len(train_data)}")
+
+ print("加载测试数据...")
+ test_data = load_corpus_bert(test_path)
+ print(f"测试数据量: {len(test_data)}")
+
+ return train_data, test_data
+
+
+def main():
+ """主函数"""
+ parser = argparse.ArgumentParser(description='BERT情感分析模型训练')
+ parser.add_argument('--train_path', type=str, default='./data/weibo2018/train.txt',
+ help='训练数据路径')
+ parser.add_argument('--test_path', type=str, default='./data/weibo2018/test.txt',
+ help='测试数据路径')
+ parser.add_argument('--model_path', type=str, default='./model/bert_model.pth',
+ help='模型保存路径')
+ parser.add_argument('--bert_path', type=str, default='./model/chinese_wwm_pytorch',
+ help='BERT预训练模型路径')
+ parser.add_argument('--epochs', type=int, default=10,
+ help='训练轮数')
+ parser.add_argument('--batch_size', type=int, default=100,
+ help='批大小')
+ parser.add_argument('--learning_rate', type=float, default=1e-3,
+ help='学习率')
+ parser.add_argument('--eval_only', action='store_true',
+ help='仅评估已有模型,不进行训练')
+
+ args = parser.parse_args()
+
+ # 创建模型
+ model = BertModel_Custom(args.bert_path)
+
+ if args.eval_only:
+ # 仅评估模式
+ print("评估模式:加载已有模型进行评估")
+ model.load_model(args.model_path)
+
+ # 加载测试数据
+ _, test_data = model.load_data(args.train_path, args.test_path)
+
+ # 评估模型
+ model.evaluate(test_data)
+ else:
+ # 训练模式
+ # 加载数据
+ train_data, test_data = model.load_data(args.train_path, args.test_path)
+
+ # 训练模型
+ model.train(
+ train_data,
+ num_epochs=args.epochs,
+ batch_size=args.batch_size,
+ learning_rate=args.learning_rate
+ )
+
+ # 评估模型
+ model.evaluate(test_data)
+
+ # 保存模型
+ model.save_model(args.model_path)
+
+ # 示例预测
+ print("\n示例预测:")
+ test_texts = [
+ "今天天气真好,心情很棒",
+ "这部电影太无聊了,浪费时间",
+ "哈哈哈,太有趣了"
+ ]
+
+ for text in test_texts:
+ pred, conf = model.predict_single(text)
+ sentiment = "正面" if pred == 1 else "负面"
+ print(f"文本: {text}")
+ print(f"预测: {sentiment} (置信度: {conf:.4f})")
+ print()
+
+
+if __name__ == "__main__":
+ main()
\ No newline at end of file
diff --git a/WeiboSentiment_MachineLearning/lstm_train.py b/WeiboSentiment_MachineLearning/lstm_train.py
new file mode 100644
index 0000000..b08417b
--- /dev/null
+++ b/WeiboSentiment_MachineLearning/lstm_train.py
@@ -0,0 +1,352 @@
+# -*- coding: utf-8 -*-
+"""
+LSTM情感分析模型训练脚本
+"""
+import argparse
+import os
+import pandas as pd
+import torch
+import torch.nn as nn
+from torch.utils.data import Dataset, DataLoader
+from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
+from gensim import models
+from sklearn.metrics import accuracy_score, f1_score, classification_report, roc_auc_score
+from typing import List, Tuple, Dict, Any
+import numpy as np
+
+from base_model import BaseModel
+
+
+class LSTMDataset(Dataset):
+ """LSTM数据集"""
+
+ def __init__(self, data: List[Tuple[str, int]], word2vec_model):
+ self.data = []
+ self.label = []
+
+ for text, label in data:
+ vectors = []
+ for word in text.split(" "):
+ if word in word2vec_model.wv.key_to_index:
+ vectors.append(word2vec_model.wv[word])
+
+ if len(vectors) > 0: # 确保有有效的词向量
+ vectors = torch.Tensor(vectors)
+ self.data.append(vectors)
+ self.label.append(label)
+
+ def __getitem__(self, index):
+ return self.data[index], self.label[index]
+
+ def __len__(self):
+ return len(self.label)
+
+
+def collate_fn(data):
+ """批处理函数"""
+ data.sort(key=lambda x: len(x[0]), reverse=True)
+ data_length = [len(sq[0]) for sq in data]
+ x = [i[0] for i in data]
+ y = [i[1] for i in data]
+ data = pad_sequence(x, batch_first=True, padding_value=0)
+ return data, torch.tensor(y, dtype=torch.float32), data_length
+
+
+class LSTMNet(nn.Module):
+ """LSTM网络结构"""
+
+ def __init__(self, input_size, hidden_size, num_layers):
+ super(LSTMNet, self).__init__()
+ self.hidden_size = hidden_size
+ self.num_layers = num_layers
+ self.lstm = nn.LSTM(input_size, hidden_size, num_layers,
+ batch_first=True, bidirectional=True)
+ self.fc = nn.Linear(hidden_size * 2, 1) # 双向LSTM
+ self.sigmoid = nn.Sigmoid()
+
+ def forward(self, x, lengths):
+ device = x.device
+ h0 = torch.zeros(self.num_layers * 2, x.size(0), self.hidden_size).to(device)
+ c0 = torch.zeros(self.num_layers * 2, x.size(0), self.hidden_size).to(device)
+
+ packed_input = pack_padded_sequence(input=x, lengths=lengths, batch_first=True)
+ packed_out, (h_n, h_c) = self.lstm(packed_input, (h0, c0))
+
+ # 双向LSTM,拼接最后的隐藏状态
+ lstm_out = torch.cat([h_n[-2], h_n[-1]], 1)
+ out = self.fc(lstm_out)
+ out = self.sigmoid(out)
+ return out
+
+
+class LSTMModel(BaseModel):
+ """LSTM情感分析模型"""
+
+ def __init__(self):
+ super().__init__("LSTM")
+ self.word2vec_model = None
+ self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+
+ def _train_word2vec(self, train_data: List[Tuple[str, int]], **kwargs):
+ """训练Word2Vec词向量"""
+ print("训练Word2Vec词向量...")
+
+ # 准备Word2Vec输入数据
+ wv_input = [text.split(" ") for text, _ in train_data]
+
+ vector_size = kwargs.get('vector_size', 64)
+ min_count = kwargs.get('min_count', 1)
+ epochs = kwargs.get('epochs', 1000)
+
+ # 训练Word2Vec
+ self.word2vec_model = models.Word2Vec(
+ wv_input,
+ vector_size=vector_size,
+ min_count=min_count,
+ epochs=epochs
+ )
+
+ print(f"Word2Vec训练完成,词向量维度: {vector_size}")
+
+ def train(self, train_data: List[Tuple[str, int]], **kwargs) -> None:
+ """训练LSTM模型"""
+ print(f"开始训练 {self.model_name} 模型...")
+
+ # 训练Word2Vec
+ self._train_word2vec(train_data, **kwargs)
+
+ # 超参数
+ learning_rate = kwargs.get('learning_rate', 5e-4)
+ num_epochs = kwargs.get('num_epochs', 5)
+ batch_size = kwargs.get('batch_size', 100)
+ embed_size = kwargs.get('embed_size', 64)
+ hidden_size = kwargs.get('hidden_size', 64)
+ num_layers = kwargs.get('num_layers', 2)
+
+ print(f"LSTM超参数: lr={learning_rate}, epochs={num_epochs}, "
+ f"batch_size={batch_size}, hidden_size={hidden_size}")
+
+ # 创建数据集
+ train_dataset = LSTMDataset(train_data, self.word2vec_model)
+ train_loader = DataLoader(train_dataset, batch_size=batch_size,
+ collate_fn=collate_fn, shuffle=True)
+
+ # 创建模型
+ self.model = LSTMNet(embed_size, hidden_size, num_layers).to(self.device)
+
+ # 损失函数和优化器
+ criterion = nn.BCELoss()
+ optimizer = torch.optim.Adam(self.model.parameters(), lr=learning_rate)
+
+ # 训练循环
+ self.model.train()
+ for epoch in range(num_epochs):
+ total_loss = 0
+ num_batches = 0
+
+ for i, (x, labels, lengths) in enumerate(train_loader):
+ x = x.to(self.device)
+ labels = labels.to(self.device)
+
+ # 前向传播
+ outputs = self.model(x, lengths)
+ logits = outputs.view(-1)
+ loss = criterion(logits, labels)
+
+ # 反向传播
+ optimizer.zero_grad()
+ loss.backward()
+ optimizer.step()
+
+ total_loss += loss.item()
+ num_batches += 1
+
+ if (i + 1) % 10 == 0:
+ avg_loss = total_loss / num_batches
+ print(f"Epoch [{epoch+1}/{num_epochs}], Step [{i+1}], Loss: {avg_loss:.4f}")
+
+ # 保存每个epoch的模型
+ if kwargs.get('save_each_epoch', False):
+ epoch_model_path = f"./model/lstm_epoch_{epoch+1}.pth"
+ os.makedirs(os.path.dirname(epoch_model_path), exist_ok=True)
+ torch.save(self.model.state_dict(), epoch_model_path)
+ print(f"已保存模型: {epoch_model_path}")
+
+ self.is_trained = True
+ print(f"{self.model_name} 模型训练完成!")
+
+ def predict(self, texts: List[str]) -> List[int]:
+ """预测文本情感"""
+ if not self.is_trained:
+ raise ValueError(f"模型 {self.model_name} 尚未训练,请先调用train方法")
+
+ # 创建数据集
+ test_data = [(text, 0) for text in texts] # 标签无关紧要
+ test_dataset = LSTMDataset(test_data, self.word2vec_model)
+ test_loader = DataLoader(test_dataset, batch_size=32, collate_fn=collate_fn)
+
+ predictions = []
+ self.model.eval()
+
+ with torch.no_grad():
+ for x, _, lengths in test_loader:
+ x = x.to(self.device)
+ outputs = self.model(x, lengths)
+ outputs = outputs.view(-1)
+
+ # 转换为类别标签
+ preds = (outputs > 0.5).cpu().numpy()
+ predictions.extend(preds.astype(int).tolist())
+
+ return predictions
+
+ def predict_single(self, text: str) -> Tuple[int, float]:
+ """预测单条文本的情感"""
+ if not self.is_trained:
+ raise ValueError(f"模型 {self.model_name} 尚未训练,请先调用train方法")
+
+ # 转换为词向量
+ vectors = []
+ for word in text.split(" "):
+ if word in self.word2vec_model.wv.key_to_index:
+ vectors.append(self.word2vec_model.wv[word])
+
+ if len(vectors) == 0:
+ return 0, 0.5 # 如果没有有效词向量,返回默认值
+
+ # 转换为tensor
+ x = torch.Tensor(vectors).unsqueeze(0).to(self.device) # 添加batch维度
+ lengths = [len(vectors)]
+
+ self.model.eval()
+ with torch.no_grad():
+ output = self.model(x, lengths)
+ prob = output.item()
+ prediction = int(prob > 0.5)
+ confidence = prob if prediction == 1 else 1 - prob
+
+ return prediction, confidence
+
+ def save_model(self, model_path: str = None) -> None:
+ """保存模型"""
+ if not self.is_trained:
+ raise ValueError(f"模型 {self.model_name} 尚未训练,无法保存")
+
+ if model_path is None:
+ model_path = f"./model/{self.model_name.lower()}_model.pth"
+
+ os.makedirs(os.path.dirname(model_path), exist_ok=True)
+
+ # 保存模型状态和Word2Vec
+ model_data = {
+ 'model_state_dict': self.model.state_dict(),
+ 'word2vec_model': self.word2vec_model,
+ 'model_config': {
+ 'embed_size': 64,
+ 'hidden_size': 64,
+ 'num_layers': 2
+ },
+ 'device': str(self.device)
+ }
+
+ torch.save(model_data, model_path)
+ print(f"模型已保存到: {model_path}")
+
+ def load_model(self, model_path: str) -> None:
+ """加载模型"""
+ if not os.path.exists(model_path):
+ raise FileNotFoundError(f"模型文件不存在: {model_path}")
+
+ model_data = torch.load(model_path, map_location=self.device)
+
+ # 加载Word2Vec
+ self.word2vec_model = model_data['word2vec_model']
+
+ # 重建LSTM网络
+ config = model_data['model_config']
+ self.model = LSTMNet(
+ config['embed_size'],
+ config['hidden_size'],
+ config['num_layers']
+ ).to(self.device)
+
+ # 加载模型权重
+ self.model.load_state_dict(model_data['model_state_dict'])
+
+ self.is_trained = True
+ print(f"已加载模型: {model_path}")
+
+
+def main():
+ """主函数"""
+ parser = argparse.ArgumentParser(description='LSTM情感分析模型训练')
+ parser.add_argument('--train_path', type=str, default='./data/weibo2018/train.txt',
+ help='训练数据路径')
+ parser.add_argument('--test_path', type=str, default='./data/weibo2018/test.txt',
+ help='测试数据路径')
+ parser.add_argument('--model_path', type=str, default='./model/lstm_model.pth',
+ help='模型保存路径')
+ parser.add_argument('--epochs', type=int, default=5,
+ help='训练轮数')
+ parser.add_argument('--batch_size', type=int, default=100,
+ help='批大小')
+ parser.add_argument('--hidden_size', type=int, default=64,
+ help='LSTM隐藏层大小')
+ parser.add_argument('--learning_rate', type=float, default=5e-4,
+ help='学习率')
+ parser.add_argument('--eval_only', action='store_true',
+ help='仅评估已有模型,不进行训练')
+
+ args = parser.parse_args()
+
+ # 创建模型
+ model = LSTMModel()
+
+ if args.eval_only:
+ # 仅评估模式
+ print("评估模式:加载已有模型进行评估")
+ model.load_model(args.model_path)
+
+ # 加载测试数据
+ _, test_data = BaseModel.load_data(args.train_path, args.test_path)
+
+ # 评估模型
+ model.evaluate(test_data)
+ else:
+ # 训练模式
+ # 加载数据
+ train_data, test_data = BaseModel.load_data(args.train_path, args.test_path)
+
+ # 训练模型
+ model.train(
+ train_data,
+ num_epochs=args.epochs,
+ batch_size=args.batch_size,
+ hidden_size=args.hidden_size,
+ learning_rate=args.learning_rate
+ )
+
+ # 评估模型
+ model.evaluate(test_data)
+
+ # 保存模型
+ model.save_model(args.model_path)
+
+ # 示例预测
+ print("\n示例预测:")
+ test_texts = [
+ "今天天气真好,心情很棒",
+ "这部电影太无聊了,浪费时间",
+ "哈哈哈,太有趣了"
+ ]
+
+ for text in test_texts:
+ pred, conf = model.predict_single(text)
+ sentiment = "正面" if pred == 1 else "负面"
+ print(f"文本: {text}")
+ print(f"预测: {sentiment} (置信度: {conf:.4f})")
+ print()
+
+
+if __name__ == "__main__":
+ main()
\ No newline at end of file
diff --git a/WeiboSentiment_MachineLearning/model/bayes_model.pkl b/WeiboSentiment_MachineLearning/model/bayes_model.pkl
new file mode 100644
index 0000000..94c2a74
Binary files /dev/null and b/WeiboSentiment_MachineLearning/model/bayes_model.pkl differ
diff --git a/WeiboSentiment_MachineLearning/model/bert_dnn_8.model b/WeiboSentiment_MachineLearning/model/bert_dnn_8.model
deleted file mode 100644
index 20b5bd6..0000000
Binary files a/WeiboSentiment_MachineLearning/model/bert_dnn_8.model and /dev/null differ
diff --git a/WeiboSentiment_MachineLearning/model/bert_model.pth b/WeiboSentiment_MachineLearning/model/bert_model.pth
new file mode 100644
index 0000000..be057fe
Binary files /dev/null and b/WeiboSentiment_MachineLearning/model/bert_model.pth differ
diff --git a/WeiboSentiment_MachineLearning/model/lstm_5.model b/WeiboSentiment_MachineLearning/model/lstm_5.model
deleted file mode 100644
index ffdfb80..0000000
Binary files a/WeiboSentiment_MachineLearning/model/lstm_5.model and /dev/null differ
diff --git a/WeiboSentiment_MachineLearning/model/lstm_model.pth b/WeiboSentiment_MachineLearning/model/lstm_model.pth
new file mode 100644
index 0000000..733d21d
Binary files /dev/null and b/WeiboSentiment_MachineLearning/model/lstm_model.pth differ
diff --git a/WeiboSentiment_MachineLearning/model/svm_model.pkl b/WeiboSentiment_MachineLearning/model/svm_model.pkl
new file mode 100644
index 0000000..4b88b5b
Binary files /dev/null and b/WeiboSentiment_MachineLearning/model/svm_model.pkl differ
diff --git a/WeiboSentiment_MachineLearning/model/xgboost_model.pkl b/WeiboSentiment_MachineLearning/model/xgboost_model.pkl
new file mode 100644
index 0000000..245dbe2
Binary files /dev/null and b/WeiboSentiment_MachineLearning/model/xgboost_model.pkl differ
diff --git a/WeiboSentiment_MachineLearning/predict.py b/WeiboSentiment_MachineLearning/predict.py
new file mode 100644
index 0000000..12ef908
--- /dev/null
+++ b/WeiboSentiment_MachineLearning/predict.py
@@ -0,0 +1,310 @@
+# -*- coding: utf-8 -*-
+"""
+统一的情感分析预测程序
+支持加载所有模型进行情感预测
+"""
+import argparse
+import os
+import re
+from typing import Dict, Tuple, List
+import warnings
+warnings.filterwarnings("ignore")
+
+# 导入所有模型类
+from bayes_train import BayesModel
+from svm_train import SVMModel
+from xgboost_train import XGBoostModel
+from lstm_train import LSTMModel
+from bert_train import BertModel_Custom
+from utils import processing
+
+
+class SentimentPredictor:
+ """情感分析预测器"""
+
+ def __init__(self):
+ self.models = {}
+ self.available_models = {
+ 'bayes': BayesModel,
+ 'svm': SVMModel,
+ 'xgboost': XGBoostModel,
+ 'lstm': LSTMModel,
+ 'bert': BertModel_Custom
+ }
+
+ def load_model(self, model_type: str, model_path: str, **kwargs) -> None:
+ """加载指定类型的模型
+
+ Args:
+ model_type: 模型类型 ('bayes', 'svm', 'xgboost', 'lstm', 'bert')
+ model_path: 模型文件路径
+ **kwargs: 其他参数(如BERT的预训练模型路径)
+ """
+ if model_type not in self.available_models:
+ raise ValueError(f"不支持的模型类型: {model_type}")
+
+ if not os.path.exists(model_path):
+ print(f"警告: 模型文件不存在: {model_path}")
+ return
+
+ print(f"加载 {model_type.upper()} 模型...")
+
+ try:
+ if model_type == 'bert':
+ # BERT需要额外的预训练模型路径
+ bert_path = kwargs.get('bert_path', './model/chinese_wwm_pytorch')
+ model = BertModel_Custom(bert_path)
+ else:
+ model = self.available_models[model_type]()
+
+ model.load_model(model_path)
+ self.models[model_type] = model
+ print(f"{model_type.upper()} 模型加载成功")
+
+ except Exception as e:
+ print(f"加载 {model_type.upper()} 模型失败: {e}")
+
+ def load_all_models(self, model_dir: str = './model', bert_path: str = './model/chinese_wwm_pytorch') -> None:
+ """加载所有可用的模型
+
+ Args:
+ model_dir: 模型文件目录
+ bert_path: BERT预训练模型路径
+ """
+ model_files = {
+ 'bayes': os.path.join(model_dir, 'bayes_model.pkl'),
+ 'svm': os.path.join(model_dir, 'svm_model.pkl'),
+ 'xgboost': os.path.join(model_dir, 'xgboost_model.pkl'),
+ 'lstm': os.path.join(model_dir, 'lstm_model.pth'),
+ 'bert': os.path.join(model_dir, 'bert_model.pth')
+ }
+
+ print("开始加载所有可用模型...")
+ for model_type, model_path in model_files.items():
+ self.load_model(model_type, model_path, bert_path=bert_path)
+
+ print(f"\n已加载 {len(self.models)} 个模型: {list(self.models.keys())}")
+
+ def predict_single(self, text: str, model_type: str = None) -> Dict[str, Tuple[int, float]]:
+ """预测单条文本的情感
+
+ Args:
+ text: 待预测文本
+ model_type: 指定模型类型,如果为None则使用所有已加载的模型
+
+ Returns:
+ Dict[model_type, (prediction, confidence)]
+ """
+ # 文本预处理
+ processed_text = processing(text)
+
+ if model_type:
+ if model_type not in self.models:
+ raise ValueError(f"模型 {model_type} 未加载")
+
+ prediction, confidence = self.models[model_type].predict_single(processed_text)
+ return {model_type: (prediction, confidence)}
+
+ # 使用所有模型预测
+ results = {}
+ for name, model in self.models.items():
+ try:
+ prediction, confidence = model.predict_single(processed_text)
+ results[name] = (prediction, confidence)
+ except Exception as e:
+ print(f"模型 {name} 预测失败: {e}")
+ results[name] = (0, 0.0)
+
+ return results
+
+ def predict_batch(self, texts: List[str], model_type: str = None) -> Dict[str, List[int]]:
+ """批量预测文本情感
+
+ Args:
+ texts: 待预测文本列表
+ model_type: 指定模型类型,如果为None则使用所有已加载的模型
+
+ Returns:
+ Dict[model_type, predictions]
+ """
+ # 文本预处理
+ processed_texts = [processing(text) for text in texts]
+
+ if model_type:
+ if model_type not in self.models:
+ raise ValueError(f"模型 {model_type} 未加载")
+
+ predictions = self.models[model_type].predict(processed_texts)
+ return {model_type: predictions}
+
+ # 使用所有模型预测
+ results = {}
+ for name, model in self.models.items():
+ try:
+ predictions = model.predict(processed_texts)
+ results[name] = predictions
+ except Exception as e:
+ print(f"模型 {name} 预测失败: {e}")
+ results[name] = [0] * len(texts)
+
+ return results
+
+ def ensemble_predict(self, text: str, weights: Dict[str, float] = None) -> Tuple[int, float]:
+ """集成预测(多个模型投票)
+
+ Args:
+ text: 待预测文本
+ weights: 模型权重,如果为None则平均权重
+
+ Returns:
+ (prediction, confidence)
+ """
+ if len(self.models) == 0:
+ raise ValueError("没有加载任何模型")
+
+ results = self.predict_single(text)
+
+ if weights is None:
+ weights = {name: 1.0 for name in results.keys()}
+
+ # 加权平均
+ total_weight = 0
+ weighted_prob = 0
+
+ for model_name, (pred, conf) in results.items():
+ if model_name in weights:
+ weight = weights[model_name]
+ prob = conf if pred == 1 else 1 - conf
+ weighted_prob += prob * weight
+ total_weight += weight
+
+ if total_weight == 0:
+ return 0, 0.5
+
+ final_prob = weighted_prob / total_weight
+ final_pred = int(final_prob > 0.5)
+ final_conf = final_prob if final_pred == 1 else 1 - final_prob
+
+ return final_pred, final_conf
+
+ def interactive_predict(self):
+ """交互式预测模式"""
+ if len(self.models) == 0:
+ print("错误: 没有加载任何模型,请先加载模型")
+ return
+
+ print("\n" + "="*50)
+ print("="*50)
+ print(f"已加载模型: {', '.join(self.models.keys())}")
+ print("输入 'q' 退出程序")
+ print("输入 'models' 查看模型列表")
+ print("输入 'ensemble' 使用集成预测")
+ print("-"*50)
+
+ while True:
+ try:
+ text = input("\n请输入要分析的微博内容: ").strip()
+
+ if text.lower() == 'q':
+ print("👋 再见!")
+ break
+
+ if text.lower() == 'models':
+ print(f"已加载模型: {list(self.models.keys())}")
+ continue
+
+ if text.lower() == 'ensemble':
+ if len(self.models) > 1:
+ pred, conf = self.ensemble_predict(text)
+ sentiment = "😊 正面" if pred == 1 else "😞 负面"
+ print(f"\n🤖 集成预测结果:")
+ print(f" 情感倾向: {sentiment}")
+ print(f" 置信度: {conf:.4f}")
+ else:
+ print("❌ 集成预测需要至少2个模型")
+ continue
+
+ if not text:
+ print("❌ 请输入有效内容")
+ continue
+
+ # 预测
+ results = self.predict_single(text)
+
+ print(f"\n📝 原文: {text}")
+ print("🔍 预测结果:")
+
+ for model_name, (pred, conf) in results.items():
+ sentiment = "😊 正面" if pred == 1 else "😞 负面"
+ print(f" {model_name.upper():8}: {sentiment} (置信度: {conf:.4f})")
+
+ # 如果有多个模型,显示集成结果
+ if len(results) > 1:
+ ensemble_pred, ensemble_conf = self.ensemble_predict(text)
+ ensemble_sentiment = "😊 正面" if ensemble_pred == 1 else "😞 负面"
+ print(f" {'集成':8}: {ensemble_sentiment} (置信度: {ensemble_conf:.4f})")
+
+ except KeyboardInterrupt:
+ print("\n\n👋 程序被中断,再见!")
+ break
+ except Exception as e:
+ print(f"❌ 预测过程中出现错误: {e}")
+
+
+def main():
+ """主函数"""
+ parser = argparse.ArgumentParser(description='微博情感分析统一预测程序')
+ parser.add_argument('--model_dir', type=str, default='./model',
+ help='模型文件目录')
+ parser.add_argument('--bert_path', type=str, default='./model/chinese_wwm_pytorch',
+ help='BERT预训练模型路径')
+ parser.add_argument('--model_type', type=str, choices=['bayes', 'svm', 'xgboost', 'lstm', 'bert'],
+ help='指定单个模型类型进行预测')
+ parser.add_argument('--text', type=str,
+ help='直接预测指定文本')
+ parser.add_argument('--interactive', action='store_true', default=True,
+ help='交互式预测模式(默认)')
+ parser.add_argument('--ensemble', action='store_true',
+ help='使用集成预测')
+
+ args = parser.parse_args()
+
+ # 创建预测器
+ predictor = SentimentPredictor()
+
+ # 加载模型
+ if args.model_type:
+ # 加载指定模型
+ model_files = {
+ 'bayes': 'bayes_model.pkl',
+ 'svm': 'svm_model.pkl',
+ 'xgboost': 'xgboost_model.pkl',
+ 'lstm': 'lstm_model.pth',
+ 'bert': 'bert_model.pth'
+ }
+ model_path = os.path.join(args.model_dir, model_files[args.model_type])
+ predictor.load_model(args.model_type, model_path, bert_path=args.bert_path)
+ else:
+ # 加载所有模型
+ predictor.load_all_models(args.model_dir, args.bert_path)
+
+ # 如果指定了文本,直接预测
+ if args.text:
+ if args.ensemble and len(predictor.models) > 1:
+ pred, conf = predictor.ensemble_predict(args.text)
+ sentiment = "正面" if pred == 1 else "负面"
+ print(f"文本: {args.text}")
+ print(f"集成预测: {sentiment} (置信度: {conf:.4f})")
+ else:
+ results = predictor.predict_single(args.text, args.model_type)
+ print(f"文本: {args.text}")
+ for model_name, (pred, conf) in results.items():
+ sentiment = "正面" if pred == 1 else "负面"
+ print(f"{model_name.upper()}: {sentiment} (置信度: {conf:.4f})")
+ elif args.interactive:
+ # 交互式模式
+ predictor.interactive_predict()
+
+
+if __name__ == "__main__":
+ main()
\ No newline at end of file
diff --git a/WeiboSentiment_MachineLearning/requirements.txt b/WeiboSentiment_MachineLearning/requirements.txt
index ae6028f..775cf19 100644
--- a/WeiboSentiment_MachineLearning/requirements.txt
+++ b/WeiboSentiment_MachineLearning/requirements.txt
@@ -1,6 +1,9 @@
jieba==0.42.1
-pytorch==1.7.1
-sklearn==0.23.2
-xgboost==1.2.1
-transformers==4.6.1
-gensim==4.0.1
\ No newline at end of file
+torch>=1.7.1
+scikit-learn>=0.23.2
+xgboost>=1.2.1
+transformers>=4.6.1
+gensim>=4.0.1
+pandas>=1.3.0
+numpy>=1.20.0
+tqdm>=4.60.0
\ No newline at end of file
diff --git a/WeiboSentiment_MachineLearning/svm_train.py b/WeiboSentiment_MachineLearning/svm_train.py
new file mode 100644
index 0000000..4afdc02
--- /dev/null
+++ b/WeiboSentiment_MachineLearning/svm_train.py
@@ -0,0 +1,166 @@
+# -*- coding: utf-8 -*-
+"""
+SVM情感分析模型训练脚本
+"""
+import argparse
+import pandas as pd
+from typing import List, Tuple
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn import svm
+from sklearn.metrics import accuracy_score, f1_score
+
+from base_model import BaseModel
+from utils import stopwords
+
+
+class SVMModel(BaseModel):
+ """SVM情感分析模型"""
+
+ def __init__(self):
+ super().__init__("SVM")
+
+ def train(self, train_data: List[Tuple[str, int]], **kwargs) -> None:
+ """训练SVM模型
+
+ Args:
+ train_data: 训练数据,格式为[(text, label), ...]
+ **kwargs: 其他参数,支持kernel, C等SVM参数
+ """
+ print(f"开始训练 {self.model_name} 模型...")
+
+ # 准备数据
+ df_train = pd.DataFrame(train_data, columns=["words", "label"])
+
+ # 特征编码(TF-IDF模型)
+ print("构建TF-IDF特征...")
+ self.vectorizer = TfidfVectorizer(
+ token_pattern=r'\[?\w+\]?',
+ stop_words=stopwords
+ )
+
+ X_train = self.vectorizer.fit_transform(df_train["words"])
+ y_train = df_train["label"]
+
+ print(f"特征维度: {X_train.shape[1]}")
+
+ # 获取SVM参数
+ kernel = kwargs.get('kernel', 'rbf')
+ C = kwargs.get('C', 1.0)
+ gamma = kwargs.get('gamma', 'scale')
+
+ # 训练模型
+ print(f"训练SVM分类器 (kernel={kernel}, C={C}, gamma={gamma})...")
+ self.model = svm.SVC(kernel=kernel, C=C, gamma=gamma, probability=True)
+ self.model.fit(X_train, y_train)
+
+ self.is_trained = True
+ print(f"{self.model_name} 模型训练完成!")
+
+ def predict(self, texts: List[str]) -> List[int]:
+ """预测文本情感
+
+ Args:
+ texts: 待预测文本列表
+
+ Returns:
+ 预测结果列表
+ """
+ if not self.is_trained:
+ raise ValueError(f"模型 {self.model_name} 尚未训练,请先调用train方法")
+
+ # 特征转换
+ X = self.vectorizer.transform(texts)
+
+ # 预测
+ predictions = self.model.predict(X)
+
+ return predictions.tolist()
+
+ def predict_single(self, text: str) -> Tuple[int, float]:
+ """预测单条文本的情感
+
+ Args:
+ text: 待预测文本
+
+ Returns:
+ (predicted_label, confidence)
+ """
+ if not self.is_trained:
+ raise ValueError(f"模型 {self.model_name} 尚未训练,请先调用train方法")
+
+ # 特征转换
+ X = self.vectorizer.transform([text])
+
+ # 预测
+ prediction = self.model.predict(X)[0]
+ probabilities = self.model.predict_proba(X)[0]
+ confidence = max(probabilities)
+
+ return int(prediction), float(confidence)
+
+
+def main():
+ """主函数"""
+ parser = argparse.ArgumentParser(description='SVM情感分析模型训练')
+ parser.add_argument('--train_path', type=str, default='./data/weibo2018/train.txt',
+ help='训练数据路径')
+ parser.add_argument('--test_path', type=str, default='./data/weibo2018/test.txt',
+ help='测试数据路径')
+ parser.add_argument('--model_path', type=str, default='./model/svm_model.pkl',
+ help='模型保存路径')
+ parser.add_argument('--kernel', type=str, default='rbf', choices=['linear', 'poly', 'rbf', 'sigmoid'],
+ help='SVM核函数类型')
+ parser.add_argument('--C', type=float, default=1.0,
+ help='SVM正则化参数C')
+ parser.add_argument('--gamma', type=str, default='scale',
+ help='SVM核函数参数gamma')
+ parser.add_argument('--eval_only', action='store_true',
+ help='仅评估已有模型,不进行训练')
+
+ args = parser.parse_args()
+
+ # 创建模型
+ model = SVMModel()
+
+ if args.eval_only:
+ # 仅评估模式
+ print("评估模式:加载已有模型进行评估")
+ model.load_model(args.model_path)
+
+ # 加载测试数据
+ _, test_data = BaseModel.load_data(args.train_path, args.test_path)
+
+ # 评估模型
+ model.evaluate(test_data)
+ else:
+ # 训练模式
+ # 加载数据
+ train_data, test_data = BaseModel.load_data(args.train_path, args.test_path)
+
+ # 训练模型
+ model.train(train_data, kernel=args.kernel, C=args.C, gamma=args.gamma)
+
+ # 评估模型
+ model.evaluate(test_data)
+
+ # 保存模型
+ model.save_model(args.model_path)
+
+ # 示例预测
+ print("\n示例预测:")
+ test_texts = [
+ "今天天气真好,心情很棒",
+ "这部电影太无聊了,浪费时间",
+ "哈哈哈,太有趣了"
+ ]
+
+ for text in test_texts:
+ pred, conf = model.predict_single(text)
+ sentiment = "正面" if pred == 1 else "负面"
+ print(f"文本: {text}")
+ print(f"预测: {sentiment} (置信度: {conf:.4f})")
+ print()
+
+
+if __name__ == "__main__":
+ main()
\ No newline at end of file
diff --git a/WeiboSentiment_MachineLearning/utils.py b/WeiboSentiment_MachineLearning/utils.py
index 50e7254..8b5c760 100644
--- a/WeiboSentiment_MachineLearning/utils.py
+++ b/WeiboSentiment_MachineLearning/utils.py
@@ -1,12 +1,20 @@
# -*- coding: utf-8 -*-
import jieba
import re
+import os
+import pickle
+from typing import List, Tuple, Any
+# 加载停用词
stopwords = []
-with open("data/stopwords.txt", "r", encoding="utf8") as f:
- for w in f:
- stopwords.append(w.strip())
+stopwords_path = "data/stopwords.txt"
+if os.path.exists(stopwords_path):
+ with open(stopwords_path, "r", encoding="utf8") as f:
+ for w in f:
+ stopwords.append(w.strip())
+else:
+ print(f"警告: 停用词文件 {stopwords_path} 不存在,将使用空停用词列表")
def load_corpus(path):
@@ -66,4 +74,65 @@ def processing_bert(text):
text = re.sub("@.+?( |$)", " ", text) # 去除 @xxx (用户名)
text = re.sub("【.+?】", " ", text) # 去除 【xx】 (里面的内容通常都不是用户自己写的)
text = re.sub("\u200b", " ", text) # '\u200b'是这个数据集中的一个bad case, 不用特别在意
- return text
\ No newline at end of file
+ return text
+
+
+def save_model(model: Any, model_path: str) -> None:
+ """
+ 保存模型到文件
+
+ Args:
+ model: 要保存的模型对象
+ model_path: 保存路径
+ """
+ os.makedirs(os.path.dirname(model_path), exist_ok=True)
+
+ with open(model_path, 'wb') as f:
+ pickle.dump(model, f)
+
+ print(f"模型已保存到: {model_path}")
+
+
+def load_model(model_path: str) -> Any:
+ """
+ 从文件加载模型
+
+ Args:
+ model_path: 模型文件路径
+
+ Returns:
+ 加载的模型对象
+ """
+ if not os.path.exists(model_path):
+ raise FileNotFoundError(f"模型文件不存在: {model_path}")
+
+ with open(model_path, 'rb') as f:
+ model = pickle.load(f)
+
+ print(f"已加载模型: {model_path}")
+ return model
+
+
+def preprocess_text_simple(text: str) -> str:
+ """
+ 简单的文本预处理函数,用于预测时的文本清洗
+
+ Args:
+ text: 原始文本
+
+ Returns:
+ 清洗后的文本
+ """
+ # 数据清洗
+ text = re.sub("\{%.+?%\}", " ", text) # 去除 {%xxx%}
+ text = re.sub("@.+?( |$)", " ", text) # 去除 @xxx
+ text = re.sub("【.+?】", " ", text) # 去除 【xx】
+ text = re.sub("\u200b", " ", text) # 去除特殊字符
+
+ # 删除表情符号
+ text = re.sub(r'[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF\U0001F680-\U0001F6FF\U0001F1E0-\U0001F1FF\U00002600-\U000027BF\U0001f900-\U0001f9ff\U0001f018-\U0001f270\U0000231a-\U0000231b\U0000238d-\U0000238d\U000024c2-\U0001f251]+', '', text)
+
+ # 多个空格合并为一个
+ text = re.sub(r"\s+", " ", text)
+
+ return text.strip()
\ No newline at end of file
diff --git a/WeiboSentiment_MachineLearning/xgboost_train.py b/WeiboSentiment_MachineLearning/xgboost_train.py
new file mode 100644
index 0000000..0c3c137
--- /dev/null
+++ b/WeiboSentiment_MachineLearning/xgboost_train.py
@@ -0,0 +1,233 @@
+# -*- coding: utf-8 -*-
+"""
+XGBoost情感分析模型训练脚本
+"""
+import argparse
+import pandas as pd
+import numpy as np
+from typing import List, Tuple
+from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
+import xgboost as xgb
+
+from base_model import BaseModel
+from utils import stopwords
+
+
+class XGBoostModel(BaseModel):
+ """XGBoost情感分析模型"""
+
+ def __init__(self):
+ super().__init__("XGBoost")
+
+ def train(self, train_data: List[Tuple[str, int]], **kwargs) -> None:
+ """训练XGBoost模型
+
+ Args:
+ train_data: 训练数据,格式为[(text, label), ...]
+ **kwargs: 其他参数,支持XGBoost的各种参数
+ """
+ print(f"开始训练 {self.model_name} 模型...")
+
+ # 准备数据
+ df_train = pd.DataFrame(train_data, columns=["words", "label"])
+
+ # 特征编码(词袋模型,限制特征数量)
+ max_features = kwargs.get('max_features', 2000)
+ print(f"构建词袋模型 (max_features={max_features})...")
+ self.vectorizer = CountVectorizer(
+ token_pattern=r'\[?\w+\]?',
+ stop_words=stopwords,
+ max_features=max_features
+ )
+
+ X_train = self.vectorizer.fit_transform(df_train["words"])
+ y_train = df_train["label"]
+
+ print(f"特征维度: {X_train.shape[1]}")
+
+ # XGBoost参数设置
+ params = {
+ 'booster': kwargs.get('booster', 'gbtree'),
+ 'max_depth': kwargs.get('max_depth', 6),
+ 'scale_pos_weight': kwargs.get('scale_pos_weight', 0.5),
+ 'colsample_bytree': kwargs.get('colsample_bytree', 0.8),
+ 'objective': 'binary:logistic',
+ 'eval_metric': 'error',
+ 'eta': kwargs.get('eta', 0.3),
+ 'nthread': kwargs.get('nthread', 10),
+ }
+
+ num_boost_round = kwargs.get('num_boost_round', 200)
+
+ print(f"训练XGBoost分类器...")
+ print(f"参数: {params}")
+ print(f"迭代轮数: {num_boost_round}")
+
+ # 创建DMatrix
+ dmatrix = xgb.DMatrix(X_train, label=y_train)
+
+ # 训练模型
+ self.model = xgb.train(params, dmatrix, num_boost_round=num_boost_round)
+
+ self.is_trained = True
+ print(f"{self.model_name} 模型训练完成!")
+
+ def predict(self, texts: List[str]) -> List[int]:
+ """预测文本情感
+
+ Args:
+ texts: 待预测文本列表
+
+ Returns:
+ 预测结果列表
+ """
+ if not self.is_trained:
+ raise ValueError(f"模型 {self.model_name} 尚未训练,请先调用train方法")
+
+ # 特征转换
+ X = self.vectorizer.transform(texts)
+
+ # 创建DMatrix
+ dmatrix = xgb.DMatrix(X)
+
+ # 预测概率
+ y_prob = self.model.predict(dmatrix)
+
+ # 转换为类别标签
+ y_pred = (y_prob > 0.5).astype(int)
+
+ return y_pred.tolist()
+
+ def predict_single(self, text: str) -> Tuple[int, float]:
+ """预测单条文本的情感
+
+ Args:
+ text: 待预测文本
+
+ Returns:
+ (predicted_label, confidence)
+ """
+ if not self.is_trained:
+ raise ValueError(f"模型 {self.model_name} 尚未训练,请先调用train方法")
+
+ # 特征转换
+ X = self.vectorizer.transform([text])
+
+ # 创建DMatrix
+ dmatrix = xgb.DMatrix(X)
+
+ # 预测概率
+ prob = self.model.predict(dmatrix)[0]
+
+ # 转换为类别标签和置信度
+ prediction = int(prob > 0.5)
+ confidence = prob if prediction == 1 else 1 - prob
+
+ return prediction, float(confidence)
+
+ def evaluate(self, test_data: List[Tuple[str, int]]) -> dict:
+ """评估模型性能,包含AUC指标"""
+ if not self.is_trained:
+ raise ValueError(f"模型 {self.model_name} 尚未训练,请先调用train方法")
+
+ texts = [item[0] for item in test_data]
+ labels = [item[1] for item in test_data]
+
+ # 预测类别
+ predictions = self.predict(texts)
+
+ # 预测概率(用于计算AUC)
+ X = self.vectorizer.transform(texts)
+ dmatrix = xgb.DMatrix(X)
+ probabilities = self.model.predict(dmatrix)
+
+ accuracy = accuracy_score(labels, predictions)
+ f1 = f1_score(labels, predictions, average='weighted')
+ auc = roc_auc_score(labels, probabilities)
+
+ print(f"\n{self.model_name} 模型评估结果:")
+ print(f"准确率: {accuracy:.4f}")
+ print(f"F1分数: {f1:.4f}")
+ print(f"AUC: {auc:.4f}")
+
+ return {
+ 'accuracy': accuracy,
+ 'f1_score': f1,
+ 'auc': auc
+ }
+
+
+def main():
+ """主函数"""
+ parser = argparse.ArgumentParser(description='XGBoost情感分析模型训练')
+ parser.add_argument('--train_path', type=str, default='./data/weibo2018/train.txt',
+ help='训练数据路径')
+ parser.add_argument('--test_path', type=str, default='./data/weibo2018/test.txt',
+ help='测试数据路径')
+ parser.add_argument('--model_path', type=str, default='./model/xgboost_model.pkl',
+ help='模型保存路径')
+ parser.add_argument('--max_features', type=int, default=2000,
+ help='最大特征数量')
+ parser.add_argument('--max_depth', type=int, default=6,
+ help='XGBoost最大深度')
+ parser.add_argument('--eta', type=float, default=0.3,
+ help='XGBoost学习率')
+ parser.add_argument('--num_boost_round', type=int, default=200,
+ help='XGBoost迭代轮数')
+ parser.add_argument('--eval_only', action='store_true',
+ help='仅评估已有模型,不进行训练')
+
+ args = parser.parse_args()
+
+ # 创建模型
+ model = XGBoostModel()
+
+ if args.eval_only:
+ # 仅评估模式
+ print("评估模式:加载已有模型进行评估")
+ model.load_model(args.model_path)
+
+ # 加载测试数据
+ _, test_data = BaseModel.load_data(args.train_path, args.test_path)
+
+ # 评估模型
+ model.evaluate(test_data)
+ else:
+ # 训练模式
+ # 加载数据
+ train_data, test_data = BaseModel.load_data(args.train_path, args.test_path)
+
+ # 训练模型
+ model.train(
+ train_data,
+ max_features=args.max_features,
+ max_depth=args.max_depth,
+ eta=args.eta,
+ num_boost_round=args.num_boost_round
+ )
+
+ # 评估模型
+ model.evaluate(test_data)
+
+ # 保存模型
+ model.save_model(args.model_path)
+
+ # 示例预测
+ print("\n示例预测:")
+ test_texts = [
+ "今天天气真好,心情很棒",
+ "这部电影太无聊了,浪费时间",
+ "哈哈哈,太有趣了"
+ ]
+
+ for text in test_texts:
+ pred, conf = model.predict_single(text)
+ sentiment = "正面" if pred == 1 else "负面"
+ print(f"文本: {text}")
+ print(f"预测: {sentiment} (置信度: {conf:.4f})")
+ print()
+
+
+if __name__ == "__main__":
+ main()
\ No newline at end of file