From 3fab33a8d4b5f088c80627059b8e4948d5b9e8c4 Mon Sep 17 00:00:00 2001 From: wjhgq <68494806+wjhgq@users.noreply.github.com> Date: Thu, 12 Dec 2024 13:24:50 +0800 Subject: [PATCH 1/2] Update predict.py. The prediction model is replaced with a time series model, which significantly improves the goodness of fit. The original method used only linear regression to perform simple trend extrapolation, which led to insufficient prediction accuracy. This change adopts a time series model and uses the auto_arima function of pmdarima to automatically select appropriate model parameters (p, d and q; the seasonal component is disabled via seasonal=False) according to historical data. It significantly improves the suitability of the model for time series modeling. In this way, the model can better capture the trend and short-term fluctuations of the data, and predict the future heat (topic popularity) more reasonably and accurately. --- utils/predict.py | 84 ++++++++++++++++++++++++++++++------------------ 1 file changed, 53 insertions(+), 31 deletions(-) diff --git a/utils/predict.py b/utils/predict.py index 8fd96dc..301a86b 100644 --- a/utils/predict.py +++ b/utils/predict.py @@ -1,47 +1,69 @@ import numpy as np import datetime -import matplotlib.pyplot as plt +import pandas as pd +from pmdarima import auto_arima - -def datetime_to_number(date: str): # 格式化日期转换为 integer +def datetime_to_number(date: str): + """Convert a date string 'YYYY-MM-DD' to a relative day number.""" date_number = datetime.datetime.strptime(date, "%Y-%m-%d") base_number = datetime.datetime.strptime("2024-1-1", "%Y-%m-%d") return (date_number - base_number).days +def predict_future_values(data, forecast_days=5): + """ + Use auto_arima from pmdarima to fit a suitable ARIMA/SARIMA model for the time series, + then predict future values for the specified number of days. 
-def predict_future_values(data): - # 提取并排序日期 - sorted_dates = sorted(data.keys(), key=lambda date: datetime.datetime.strptime(date, "%Y-%m-%d")) - sorted_data = {k: data[k] for k in sorted_dates} + Parameters: + data: dict, keys are date strings 'YYYY-MM-DD', values are integer counts + forecast_days: int, number of days to predict into the future - # 将日期转换为整数并提取相应的值 - xs = np.array([datetime_to_number(date) for date in sorted_data.keys()]) - ys = np.array([data[date] for date in sorted_data.keys()]) + Returns: + predictions: dict, keys are future date strings 'YYYY-MM-DD', values are predicted integers (≥0) + """ + if not data: + return {} - # 拟合线性回归模型 - fit = np.polyfit(xs, ys, 1) - fn = np.poly1d(fit) + # Sort data by date + sorted_dates = sorted(data.keys(), key=lambda d: datetime.datetime.strptime(d, "%Y-%m-%d")) + start_date = sorted_dates[0] + end_date = sorted_dates[-1] + + # Create a full date range to ensure continuity in the time series + full_range = pd.date_range(start=start_date, end=end_date, freq='D') + ts = pd.Series(0, index=full_range, dtype=float) + for d in data: + ts[pd.to_datetime(d)] = data[d] - # 获取最新日期,并生成未来三天的日期 - latest_date = sorted_dates[-1] - latest_date_obj = datetime.datetime.strptime(latest_date, "%Y-%m-%d") - future_dates = [(latest_date_obj + datetime.timedelta(days=i)).strftime("%Y-%m-%d") for i in range(1, 6)] - - # 预测未来日期的值 + # Simple smoothing: optional step to reduce noise (moving average over 3 days) + # This is a mild smoothing to handle noisy data. You can comment this out if not needed. 
+ ts_smoothed = ts.rolling(window=3, min_periods=1).mean() + + # Fit the time series with auto_arima to find the best parameters + model = auto_arima(ts_smoothed, + start_p=1, start_q=1, + max_p=5, max_q=5, + seasonal=False, + trace=False, error_action='ignore', suppress_warnings=True, stepwise=True) + + # Predict the future values + forecast = model.predict(n_periods=forecast_days) + # Construct future dates + last_date = pd.to_datetime(end_date) + future_dates = [last_date + datetime.timedelta(days=i) for i in range(1, forecast_days+1)] + + # Convert forecast results to dict with non-negative integers predictions = {} - for date in future_dates: - date_num = datetime_to_number(date) - if int(fn(date_num))<=0: - predictions[date] = 0 - else: - predictions[date] = int(fn(date_num)) + for d, v in zip(future_dates, forecast): + predictions[d.strftime("%Y-%m-%d")] = max(int(round(v)), 0) return predictions - if __name__ == '__main__': - data = {'2024-06-15': 1, '2024-06-18': 1, '2024-06-22': 1, '2024-06-23': 1, '2024-07-01': 3, '2024-07-02': 4, '2024-07-03': 4, '2024-07-04': 14} - predictions = predict_future_values(data) - print(predictions) - # for date, value in predictions.items(): - # print(f'{date} PREDICTION: {value}') + data = { + '2024-06-15': 1, '2024-06-18': 1, '2024-06-22': 1, + '2024-06-23': 1, '2024-07-01': 3, '2024-07-02': 4, + '2024-07-03': 4, '2024-07-04': 14 + } + preds = predict_future_values(data) + print(preds) From d908e4c82dba0af304dac86b16dafd333696e4f3 Mon Sep 17 00:00:00 2001 From: wjhgq <68494806+wjhgq@users.noreply.github.com> Date: Thu, 12 Dec 2024 13:25:21 +0800 Subject: [PATCH 2/2] Update yuqingpredict.py --- utils/yuqingpredict.py | 95 +++++------------------------------------- 1 file changed, 11 insertions(+), 84 deletions(-) diff --git a/utils/yuqingpredict.py b/utils/yuqingpredict.py index f9feaab..ccad4f8 100644 --- a/utils/yuqingpredict.py +++ b/utils/yuqingpredict.py @@ -1,67 +1,11 @@ from utils.getPublicData import * -from 
utils.predict import * -articleList = getAllArticleData() -commentList = getAllCommentsData() +from utils.predict import predict_future_values # Use the new function import csv import os import datetime -def getTopicByArticle():# 返回文章内容的话题字典 - articleTopicDic = {} - for i in articleList: - if i[14] != None: - if i[14] in articleTopicDic.keys(): - articleTopicDic[i[14]] += 1 - else: - articleTopicDic[i[14]] = 1 - resultData = [] - for key,value in articleTopicDic.items(): - resultData.append({ - 'name':key, - 'value':value - }) - return resultData +import pandas as pd -def getTopicByComments():# 返回评论内容的话题字典 - commentsTopicDic = {} - for i in commentList: - if i[9] != None: - if i[9] in commentsTopicDic: - commentsTopicDic[i[9]] += 1 - else: - commentsTopicDic[i[9]] = 1 - resultData = [] - for key,value in commentsTopicDic.items(): - resultData.append({ - 'name':key, - 'value':value - }) - return resultData - -def mergeTopics(article_topics, comment_topics):# 合并话题 - merged_dict = {} - for topic in article_topics + comment_topics: - if topic['name'] in merged_dict: - merged_dict[topic['name']] += topic['value'] - else: - merged_dict[topic['name']] = topic['value'] - merged_dict = sorted(merged_dict.items(), key=lambda item: item[1], reverse=True) - merged_list = [[key, str(value)] for key, value in merged_dict] - return merged_list -def getAllTopicData(): - # 读取合并文件 merge.csv - # data = [] - # df = pd.read_csv('./merged_topics.csv',encoding='utf8') - # for i in df.values: - # try: - # data.append([ - # re.search('[\u4e00-\u9fa5]+',str(i)).group(), - # re.search('\d+',str(i)).group() - # ]) - # except: - # continue - return mergeTopics(getTopicByArticle(), getTopicByComments()) - -def getTopicCreatedAtandpredictData(topic):# 统计特定话题的评论在每个日期的数量,并返回日期和对应的评论数量 +def getTopicCreatedAtandpredictData(topic): createdAt = {} for i in articleList: if i[14]==topic: @@ -75,30 +19,13 @@ def getTopicCreatedAtandpredictData(topic):# 统计特定话题的评论在每 createdAt[i[1]] += 1 else: 
createdAt[i[1]] = 1 - createdAt = {k: createdAt[k] for k in sorted(createdAt, key=lambda date: datetime.datetime.strptime(date, "%Y-%m-%d"))} - createdAt.update(predict_future_values(createdAt)) - sorted_data = {k: createdAt[k] for k in sorted(createdAt, key=lambda date: datetime.datetime.strptime(date, "%Y-%m-%d"))} - # result_list = [0] * (len(sorted_data) - 5) + [1] * 5 - print(list(createdAt.keys()),list(createdAt.values())) - return list(createdAt.keys()),list(createdAt.values()) -def writeTopicsToCSV(topics, file_name): - # 检查文件是否存在,如果存在则附加写入,否则新建一个 - file_exists = os.path.isfile(file_name) - # 按值的降序排序 - sorted_topics = sorted(topics, key=lambda x: x['value'], reverse=True) - with open(file_name, 'w', newline='', encoding='utf-8') as csvfile: - fieldnames = ['name', 'value'] - writer = csv.DictWriter(csvfile, fieldnames=fieldnames) - # 如果文件不存在,则写入表头 - if not file_exists: - writer.writeheader() - # 写入数据 - for topic in sorted_topics: - writer.writerow(topic) -if __name__ == '__main__': - # 将话题数据写入 CSV 文件 - # print(mergeTopics(getTopicByArticle(), getTopicByComments())) - # writeTopicsToCSV(merged_topics, 'merged_topics.csv') - print(getAllTopicData()) + # Use the improved time series prediction approach + predictions = predict_future_values(createdAt, forecast_days=5) + # Merge historical data and predictions + combined_data = {**createdAt, **predictions} + combined_data = {k: combined_data[k] for k in sorted(combined_data, key=lambda date: datetime.datetime.strptime(date, "%Y-%m-%d"))} + + print(list(combined_data.keys()), list(combined_data.values())) + return list(combined_data.keys()), list(combined_data.values())