# saas/test/与上一次运行的数据比较.py

import sys
from datetime import datetime, timedelta, timezone
import os
import pandas as pd
import zipfile
import logging
from pathlib import Path
import json
import requests
from api import API
import time

# ---------------------------- 配置项 ----------------------------

# Root directory that receives every file this script writes
output_dir = "output"  # output directory, relative to the working directory

# Create the output directory at import time if it does not exist yet
import os
os.makedirs(output_dir, exist_ok=True)

DATA_DIR = "数据快照存储"  # base name of the snapshot directory; the code below appends ".csv" to it
ARCHIVE_DIR = r"压缩包存储"  # base name of the archive directory; the code below appends ".csv" to it
RETAIN_DAYS = 7  # number of most recent days whose snapshots are kept unarchived
COMPRESS_FORMAT = "zip"  # file extension used when naming archives
LOG_FILE = "data_monitor.log"  # path of the log file
CHANGES_FILE = "changes_summary.csv"  # file name of the cumulative change summary
MAX_RETRIES = 3  # maximum number of retries after a failed attempt
RETRY_DELAY = 0.5  # delay between retries, in seconds


# ---------------------- 初始化日志配置 -----------------------
def setup_logging():
    """Configure logging to the console and to ``LOG_FILE``.

    Installs an INFO-level configuration with a timestamp/level/message
    format and two handlers: a stream handler and a file handler.

    Returns:
        The logger named after this module.
    """
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s',
        handlers=[
            logging.StreamHandler(),
            # Every message this script logs contains non-ASCII text; pin the
            # file encoding so the log does not depend on the locale code page.
            logging.FileHandler(LOG_FILE, encoding='utf-8'),
        ],
    )
    return logging.getLogger(__name__)


# Module-wide logger; logging is configured as a side effect of this call at import time.
logger = setup_logging()


# ---------------------- 工具函数 -----------------------
def get_system_agnostic_path(*path_parts):
    """Join ``path_parts`` with ``pathlib.Path`` and return the result as a string."""
    joined = Path(*path_parts)
    return str(joined)


def ensure_directory(path):
    """Create ``path`` and any missing parents; do nothing if it already exists."""
    directory = Path(path)
    directory.mkdir(parents=True, exist_ok=True)
    logger.debug(f"确保目录存在: {path}")


def get_iso8601_time():
    """Return the current UTC time as ``YYYY-MM-DDTHH:MM:SS.mmmZ``.

    The fractional part is truncated (not rounded) to milliseconds.
    """
    now_utc = datetime.now(timezone.utc)
    millis = now_utc.microsecond // 1000
    return f"{now_utc:%Y-%m-%dT%H:%M:%S}.{millis:03d}Z"


def is_first_run_today():
    """Report whether today's (UTC) snapshot files are still missing.

    Returns:
        False when both ``snapshot_<today>.csv`` and ``all_widgets_<today>.csv``
        already exist in the snapshot directory, True otherwise.
    """
    today = datetime.now(timezone.utc).strftime("%Y-%m-%d")
    snapshot_dir = os.path.join(output_dir, f"{DATA_DIR}.csv")
    snapshot_file = get_system_agnostic_path(snapshot_dir, f"snapshot_{today}.csv")
    widget_file = get_system_agnostic_path(snapshot_dir, f"all_widgets_{today}.csv")

    # Only the presence of BOTH files counts as "already ran today".
    already_ran = os.path.exists(snapshot_file) and os.path.exists(widget_file)
    if not already_ran:
        return True

    logger.info(f"检测到今日文件已存在: {snapshot_file} 和 {widget_file}")
    return False


# ---------------------- 数据监控类 -----------------------
class DataMonitor:
    def __init__(self):
        """Record the run time, create the working directories and set up API access."""
        self.execution_time = get_iso8601_time()  # ISO 8601 timestamp of this run
        self.today = datetime.now(timezone.utc).strftime("%Y-%m-%d")

        # Snapshot directory first, then archive directory.
        for base_name in (DATA_DIR, ARCHIVE_DIR):
            ensure_directory(os.path.join(output_dir, f"{base_name}.csv"))

        self.api_instance = API()

        # NOTE(review): the bearer token is hard-coded in source; consider
        # loading it from configuration or the environment instead.
        request_headers = {
            'Authorization': 'Bearer qygHulymo1fekJk4CIZyNKjyQAzG8CFN',
            'Content-Type': 'application/json'
        }
        self.headers = request_headers

        self.last_data = None  # monitored data from the previous run, used for comparison
        self.last_widget_data = None  # full widget data from the previous run

    def make_api_request(self, url, payload, method='POST'):
        """Send an HTTP request, retrying on ``requests`` errors.

        Makes up to ``MAX_RETRIES + 1`` attempts, sleeping ``RETRY_DELAY``
        seconds between them. Non-2xx responses count as failures because
        ``raise_for_status`` is called on every response.

        Args:
            url: Target URL.
            payload: Request body passed as ``data``.
            method: HTTP method name.

        Returns:
            The successful response object.

        Raises:
            requests.exceptions.RequestException: the last error, once all
                attempts have failed.
        """
        # ``retries`` counts attempts made so far, starting at 1.
        for retries in range(1, MAX_RETRIES + 2):
            try:
                response = requests.request(
                    method,
                    url,
                    headers=self.headers,
                    data=payload,
                    timeout=30
                )
                response.raise_for_status()
            except requests.exceptions.RequestException as e:
                if retries > MAX_RETRIES:
                    logger.error(f"请求失败，已达到最大重试次数 {MAX_RETRIES}")
                    raise
                logger.warning(f"请求失败 (尝试 {retries}/{MAX_RETRIES}): {str(e)}")
                time.sleep(RETRY_DELAY)
            else:
                return response
        return None

    def fetch_app_data(self):
        """Fetch the application list and return it as a DataFrame.

        Requests a single page (``skip=0``, ``limit=100``) and reads the
        ``apps`` key of the JSON response. Errors are logged and re-raised.
        """
        url = "https://api.jiandaoyun.com/api/v5/app/list"
        payload = json.dumps({"skip": 0, "limit": 100})

        try:
            body = self.make_api_request(url, payload).json()
            return pd.DataFrame(body.get("apps", []))
        except Exception as e:
            logger.error(f"获取应用数据失败: {str(e)}")
            raise

    def fetch_entry_data(self, app_df):
        """Fetch the form list of every application in ``app_df``.

        Each application is tried up to ``MAX_RETRIES + 1`` times; these
        attempts wrap ``make_api_request``, which retries on its own as well.
        An application that keeps failing is logged and skipped.

        Args:
            app_df: DataFrame with an ``app_id`` column.

        Returns:
            One DataFrame of all forms (with ``app_id`` added), or None when
            no forms were collected.
        """
        url = "https://api.jiandaoyun.com/api/v5/app/entry/list"
        collected = []

        for _, app_row in app_df.iterrows():
            for attempt in range(MAX_RETRIES + 1):
                try:
                    payload = json.dumps({"app_id": app_row['app_id']})
                    forms = self.make_api_request(url, payload).json().get("forms", [])

                    if forms:
                        form_frame = pd.DataFrame(forms)
                        form_frame['app_id'] = app_row['app_id']
                        collected.append(form_frame)
                    break
                except Exception as e:
                    # Last allowed attempt: report and move on to the next app.
                    if attempt == MAX_RETRIES:
                        logger.error(f"获取应用 {app_row['app_id']} 的表单数据失败: {str(e)}")
                        break
                    time.sleep(RETRY_DELAY)

        if not collected:
            return None
        return pd.concat(collected, ignore_index=True)

    def fetch_widget_data(self, entry_df):
        """Fetch the widget (field) list of every form in ``entry_df``.

        Each form is tried up to ``MAX_RETRIES + 1`` times on top of the
        retries done by ``make_api_request``; a form that keeps failing is
        logged and skipped.

        Args:
            entry_df: DataFrame with ``app_id`` and ``entry_id`` columns.

        Returns:
            One DataFrame of all widgets with ``app_id``, ``entry_id`` and
            ``dataModifyTime`` columns added, or None when nothing was
            collected.
        """
        url = "https://api.jiandaoyun.com/api/v5/app/entry/widget/list"
        collected = []

        for _, entry_row in entry_df.iterrows():
            for attempt in range(MAX_RETRIES + 1):
                try:
                    payload = json.dumps({
                        "app_id": entry_row['app_id'],
                        "entry_id": entry_row['entry_id']
                    })
                    body = self.make_api_request(url, payload).json()

                    widgets = body.get('widgets', [])
                    modified_at = body.get('dataModifyTime', '')

                    if widgets:
                        widget_frame = pd.DataFrame(widgets)
                        widget_frame['app_id'] = entry_row['app_id']
                        widget_frame['entry_id'] = entry_row['entry_id']
                        widget_frame['dataModifyTime'] = modified_at
                        collected.append(widget_frame)
                    break
                except Exception as e:
                    # Last allowed attempt: report and move on to the next form.
                    if attempt == MAX_RETRIES:
                        logger.error(f"获取表单 {entry_row['entry_id']} 的字段数据失败: {str(e)}")
                        break
                    time.sleep(RETRY_DELAY)

        if not collected:
            return None
        return pd.concat(collected, ignore_index=True)

    def save_all_widgets_data(self, widget_data):
        """Write the full widget table to ``all_widgets_<today>.csv``.

        The data is written to a ``.tmp`` sibling first and then moved over
        the final name in a single step.

        Args:
            widget_data: DataFrame to persist.

        Returns:
            True on success, False if anything failed (the error is logged).
        """
        try:
            filename = get_system_agnostic_path((os.path.join(output_dir, f"{DATA_DIR}.csv")), f"all_widgets_{self.today}.csv")

            temp_file = filename + '.tmp'
            widget_data.to_csv(temp_file, index=False)

            # os.replace overwrites an existing target in one step. The former
            # remove-then-rename sequence left a window with no file at all if
            # the process died in between.
            os.replace(temp_file, filename)

            logger.info(f"成功保存完整字段数据: {filename}")
            return True
        except Exception as e:
            logger.error(f"保存完整字段数据失败: {str(e)}")
            return False

    def fetch_monitor_data(self):
        """Fetch the rows of the "forms to monitor" entry as a DataFrame.

        Columns holding dicts or lists are converted to strings so that
        ``drop_duplicates`` can hash them. Up to ``MAX_RETRIES + 1`` attempts
        are made; the last error is logged and re-raised.

        Returns:
            The de-duplicated DataFrame.
        """
        for attempt in range(MAX_RETRIES + 1):
            try:
                payload = {"api_key": "6694d3c4fcb69ca9a111a6c4", "entry_id": "6850c044f17c934b3ec01fea"}
                records = self.api_instance.entry_data_list(payload).get("data")
                data_list = pd.DataFrame(records)

                for column in data_list.columns:
                    holds_containers = data_list[column].apply(lambda x: isinstance(x, (dict, list)))
                    if holds_containers.any():
                        data_list[column] = data_list[column].astype(str)
                return data_list.drop_duplicates()
            except Exception as e:
                if attempt == MAX_RETRIES:
                    logger.error(f"获取待监控表单数据失败: {str(e)}")
                    raise
                time.sleep(RETRY_DELAY)
        return None

    def match_widget_data(self, data_list, widget_list):
        """Keep the widgets whose form is listed in the monitoring table.

        Args:
            data_list: Monitoring table; its ``_widget_1750122565203`` column
                is matched against ``entry_id``.
            widget_list: DataFrame of widgets with an ``entry_id`` column.

        Returns:
            The rows of ``widget_list`` whose ``entry_id`` appears in that column.

        Raises:
            ValueError: if the column is missing. Any error is logged and
                re-raised.
        """
        try:
            if '_widget_1750122565203' not in data_list.columns:
                raise ValueError("数据列表中缺少 '_widget_1750122565203' 列")

            monitored_ids = data_list['_widget_1750122565203']
            is_monitored = widget_list['entry_id'].isin(monitored_ids)
            matched = widget_list[is_monitored]
            logger.info(f"匹配到 {len(matched)} 条字段数据")
            return matched
        except Exception as e:
            logger.error(f"字段数据匹配失败: {str(e)}")
            raise

    def save_daily_snapshot(self, data):
        """Write today's monitored-widget snapshot to ``snapshot_<today>.csv``.

        A ``unique_id`` column (``name`` + ``app_id``, both as strings) is
        added to a copy of ``data``, plus an empty ``dataModifyTime`` column
        when that column is absent. The file is written to a ``.tmp`` sibling
        and then moved over the final name in a single step.

        Args:
            data: DataFrame with at least ``name`` and ``app_id`` columns.

        Returns:
            True on success, False if anything failed (the error is logged).
        """
        try:
            filename = get_system_agnostic_path((os.path.join(output_dir, f"{DATA_DIR}.csv")), f"snapshot_{self.today}.csv")
            data = data.copy()  # work on a copy so the caller's frame is not modified
            data['unique_id'] = data['name'].astype(str) + data['app_id'].astype(str)

            if 'dataModifyTime' not in data.columns:
                data['dataModifyTime'] = ''

            temp_file = filename + '.tmp'
            data.to_csv(temp_file, index=False)

            # os.replace overwrites an existing target in one step. The former
            # remove-then-rename sequence left a window with no snapshot at all
            # if the process died in between.
            os.replace(temp_file, filename)

            logger.info(f"成功保存今日数据快照: {filename}")
            return True
        except Exception as e:
            logger.error(f"保存数据快照失败: {str(e)}")
            return False

    def archive_old_snapshots(self):
        """Move snapshot files older than the retention window into zip archives.

        Every ``snapshot_*.csv`` and ``all_widgets_*.csv`` file whose date part
        is not one of the last ``RETAIN_DAYS`` UTC dates is appended to
        ``snapshots_<YYYY-MM>.<COMPRESS_FORMAT>`` in the archive directory and
        then deleted from the snapshot directory.

        Returns:
            True on success, False if anything failed (the error is logged).
        """
        try:
            keep_dates = {
                (datetime.now(timezone.utc) - timedelta(days=offset)).strftime("%Y-%m-%d")
                for offset in range(RETAIN_DAYS)
            }

            snapshot_dir = os.path.join(output_dir, f"{DATA_DIR}.csv")
            archive_dir = os.path.join(output_dir, f"{ARCHIVE_DIR}.csv")
            directory_entries = os.listdir(snapshot_dir)

            # (file name, date part) pairs: plain snapshots first, then the
            # full-widget dumps. The date is what sits between prefix and ".csv".
            candidates = [
                (entry, entry[len(prefix):-len(".csv")])
                for prefix in ("snapshot_", "all_widgets_")
                for entry in directory_entries
                if entry.startswith(prefix) and entry.endswith(".csv")
            ]

            archived_files = 0
            for filename, date_str in candidates:
                if date_str in keep_dates:
                    continue

                year_month = date_str[:7]
                archive_name = get_system_agnostic_path(archive_dir, f"snapshots_{year_month}.{COMPRESS_FORMAT}")
                file_path = get_system_agnostic_path(snapshot_dir, filename)

                with zipfile.ZipFile(archive_name, 'a', zipfile.ZIP_DEFLATED) as zipf:
                    zipf.write(file_path, arcname=filename)

                # Delete only after the archive has been written and closed.
                os.remove(file_path)
                archived_files += 1
                logger.debug(f"已归档 {filename} 到 {archive_name}")

            logger.info(f"归档完成，共处理 {archived_files} 个文件")
            return True
        except Exception as e:
            logger.error(f"归档过程中出错: {str(e)}")
            return False

    def compare_with_last_run(self, current_data):
        """与上次运行的数据比较"""
        if self.last_data is None:
            logger.info("没有上次运行的数据可供比较")
            return None

        try:
            merged = pd.merge(
                self.last_data,
                current_data,
                on=['unique_id'],
                how='outer',
                suffixes=('_last', '_current'),
                indicator=True
            )

            changes = {
                'added': merged[merged['_merge'] == 'right_only'].copy(),
                'deleted': merged[merged['_merge'] == 'left_only'].copy(),
                'modified': pd.DataFrame()
            }

            common = merged[merged['_merge'] == 'both'].copy()

            for col in ['label', 'type']:
                if f"{col}_last" in common.columns and f"{col}_current" in common.columns:
                    common.loc[:, f"{col}_status"] = 'both'
                    mask = common[f"{col}_last"] != common[f"{col}_current"]
                    if mask.any():
                        modified = common.loc[mask].copy()
                        modified.loc[:, 'changed_field'] = col
                        modified.loc[:, 'old_value'] = modified[f"{col}_last"]
                        modified.loc[:, 'new_value'] = modified[f"{col}_current"]
                        modified.loc[:, 'change_status'] = 'update'
                        changes['modified'] = pd.concat([changes['modified'], modified])

            return changes
        except Exception as e:
            logger.error(f"数据比较失败: {str(e)}")
            return None

    def save_changes_to_csv(self, changes, all_app_id, all_entries):
        """Append the detected changes to the change-summary CSV.

        One output row is produced per added, deleted or modified widget,
        enriched with the application and form names.

        Args:
            changes: Dict with ``added``, ``deleted`` and ``modified``
                DataFrames as returned by ``compare_with_last_run``.
            all_app_id: DataFrame of applications (``app_id``, ``name``).
            all_entries: DataFrame of forms (``app_id``, ``entry_id``, ``name``).

        Returns:
            True if rows were written; False if there was nothing to write or
            an error occurred (errors are logged with traceback).
        """
        try:
            result_rows = []

            # Added rows exist only on the "current" side of the merge.
            for _, row in changes['added'].iterrows():
                result_rows.append(self._change_record(
                    row, '_current', '新增', f"新增字段: {row['label_current']}",
                    all_app_id, all_entries))

            # Deleted rows exist only on the "last" side of the merge.
            for _, row in changes['deleted'].iterrows():
                result_rows.append(self._change_record(
                    row, '_last', '删除', f"删除字段: {row['label_last']}",
                    all_app_id, all_entries))

            # An empty 'modified' frame has no columns, so guard before filtering.
            if not changes['modified'].empty:
                modified_df = changes['modified'][changes['modified']['change_status'] == 'update']
                for _, row in modified_df.iterrows():
                    result_rows.append(self._change_record(
                        row, '_current', '修改',
                        f"由\"{row['old_value']}\"修改为\"{row['new_value']}\"",
                        all_app_id, all_entries))

            if result_rows:
                result_df = pd.DataFrame(result_rows)
                changes_file = get_system_agnostic_path((os.path.join(output_dir, f"{DATA_DIR}.csv")), CHANGES_FILE)

                # Write the header only when the file is created.
                if os.path.exists(changes_file):
                    result_df.to_csv(changes_file, mode='a', header=False, index=False, encoding='utf-8-sig')
                else:
                    result_df.to_csv(changes_file, index=False, encoding='utf-8-sig')

                logger.info(f"变更数据已保存到 {changes_file}")
                return True
            else:
                logger.info("没有检测到任何变更，不生成变更文件")
                return False

        except Exception as e:
            logger.error(f"保存变更数据到CSV失败: {str(e)}", exc_info=True)
            return False

    def _change_record(self, row, suffix, change_type, detail, all_app_id, all_entries):
        """Build one change-summary row from a merged row and its column suffix."""
        app_id = row[f'app_id{suffix}']
        entry_id = row[f'entry_id{suffix}']

        app_name = self._first_name_or(
            all_app_id, all_app_id['app_id'] == app_id, '未知应用')
        entry_name = self._first_name_or(
            all_entries,
            (all_entries['app_id'] == app_id) & (all_entries['entry_id'] == entry_id),
            '未知表单')

        return {
            '程序执行时间': self.execution_time,
            'unique_id': row['unique_id'],
            'app_id': app_id,
            'app_name': app_name,
            'entry_id': entry_id,
            'entry_name': entry_name,
            'change_type': change_type,
            '具体内容': detail
        }

    @staticmethod
    def _first_name_or(frame, mask, default):
        """Return the ``name`` of the first row selected by ``mask``, else ``default``."""
        names = frame.loc[mask, 'name'].values
        return names[0] if len(names) else default

    def run_daily_snapshot(self):
        """Run the daily snapshot task.

        Fetches apps, forms and widgets, saves the full widget table, matches
        the widgets against the monitoring table, saves today's snapshot and
        archives old files. The matched and full widget data are kept on the
        instance afterwards.

        Returns:
            True when every step succeeded, False otherwise (the error is
            logged with traceback).
        """
        logger.info("=== 开始每日数据快照任务 ===")

        try:
            logger.info("获取应用数据...")
            app_df = self.fetch_app_data()
            logger.info(f"获取到 {len(app_df)} 个应用")

            logger.info("获取表单数据...")
            entry_df = self.fetch_entry_data(app_df)
            if entry_df is None:
                raise RuntimeError("没有获取到表单数据")
            logger.info(f"获取到 {len(entry_df)} 个表单")

            logger.info("获取字段数据...")
            widget_df = self.fetch_widget_data(entry_df)
            if widget_df is None:
                raise RuntimeError("没有获取到字段数据")
            logger.info(f"获取到 {len(widget_df)} 个字段")

            # Persist the complete widget table before any filtering.
            logger.info("保存完整字段数据...")
            widgets_saved = self.save_all_widgets_data(widget_df)
            if not widgets_saved:
                raise RuntimeError("保存完整字段数据失败")

            logger.info("获取待监控表单数据...")
            data_list = self.fetch_monitor_data()
            logger.info("待监控数据获取成功")

            logger.info("匹配字段数据...")
            matched_data = self.match_widget_data(data_list, widget_df)
            logger.info(f"匹配完成，共找到 {len(matched_data)} 条记录")

            # Remaining steps share one shape: announce, run, fail on a falsy result.
            closing_steps = (
                ("保存今日数据快照...", lambda: self.save_daily_snapshot(matched_data), "保存今日快照失败"),
                ("归档旧数据...", self.archive_old_snapshots, "归档旧数据失败"),
            )
            for announcement, action, failure_message in closing_steps:
                logger.info(announcement)
                if not action():
                    raise RuntimeError(failure_message)

            # Keep this run's data on the instance for later comparison.
            self.last_data = matched_data.copy()
            self.last_widget_data = widget_df.copy()

            logger.info("=== 每日数据快照任务成功完成 ===")
            return True
        except Exception as e:
            logger.error(f"每日快照任务执行失败: {str(e)}", exc_info=True)
            return False

    def run_hourly_check(self):
        """Run the hourly check task.

        Fetches apps, forms and widgets, matches the widgets against the
        monitoring table, compares the result with the previous data and
        writes any changes to the change-summary CSV.

        Returns:
            True when the check completed (including "nothing to compare" and
            "no changes"), False when a step failed (logged with traceback).
        """
        logger.info("=== 开始每小时数据检查任务 ===")

        try:
            logger.info("获取应用数据...")
            app_df = self.fetch_app_data()
            logger.info(f"获取到 {len(app_df)} 个应用")

            logger.info("获取表单数据...")
            entry_df = self.fetch_entry_data(app_df)
            if entry_df is None:
                raise RuntimeError("没有获取到表单数据")
            logger.info(f"获取到 {len(entry_df)} 个表单")

            logger.info("获取字段数据...")
            widget_df = self.fetch_widget_data(entry_df)
            if widget_df is None:
                raise RuntimeError("没有获取到字段数据")
            logger.info(f"获取到 {len(widget_df)} 个字段")

            logger.info("获取待监控表单数据...")
            data_list = self.fetch_monitor_data()
            logger.info("待监控数据获取成功")

            logger.info("匹配字段数据...")
            current_data = self.match_widget_data(data_list, widget_df)
            logger.info(f"匹配完成，共找到 {len(current_data)} 条记录")

            logger.info("比较数据变化...")
            changes = self.compare_with_last_run(current_data)

            # None means there was no baseline (or the comparison failed).
            if changes is None:
                logger.info("没有可比较的数据变更")
                return True

            has_changes = bool(changes) and any(len(v) > 0 for v in changes.values())
            if not has_changes:
                logger.info("没有检测到任何变更")
                return True

            changes_saved = self.save_changes_to_csv(changes, app_df, entry_df)
            if not changes_saved:
                raise RuntimeError("保存变更数据失败")

            # The current data becomes the baseline for the next comparison.
            self.last_data = current_data.copy()
            self.last_widget_data = widget_df.copy()

            logger.info("=== 每小时数据检查任务成功完成 ===")
            return True
        except Exception as e:
            logger.error(f"每小时检查任务执行失败: {str(e)}", exc_info=True)
            return False

    def run(self):
        """Run one monitoring pass and report whether it succeeded.

        Runs the daily snapshot task when today's snapshot files do not exist
        yet, and the hourly check otherwise.

        Returns:
            True on success, False on failure.
        """
        logger.info(f"=== 开始数据监控任务 ({self.execution_time}) ===")

        # Choose the task from the presence of today's snapshot files.
        if is_first_run_today():
            logger.info("检测到是今天的第一次运行，执行每日数据快照任务")
            task = self.run_daily_snapshot
        else:
            logger.info("执行每小时数据检查任务")
            task = self.run_hourly_check

        if task():
            logger.info("=== 数据监控任务成功完成 ===")
            return True

        logger.error("=== 数据监控任务执行失败 ===")
        return False


if __name__ == "__main__":
    # Run one monitoring pass; exit with status 1 when it fails.
    succeeded = DataMonitor().run()
    if not succeeded:
        sys.exit(1)