# xgboost修改版本
import os
import pickle
import pandas as pd
import numpy as np
from numpy.lib.stride_tricks import sliding_window_view
import tkinter as tk
import tkinter.font as tkfont
from tkinter import ttk
from datetime import timedelta
from time import time
import matplotlib.pyplot as plt
from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg, NavigationToolbar2Tk
from xgboost import XGBRegressor
from lunardate import LunarDate
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler  # 添加StandardScaler
from sklearn.ensemble import RandomForestRegressor  # 添加RandomForest作为备选模型
from xgboost import XGBRFRegressor  # 添加XGBoost的随机森林变种
import matplotlib
from scipy.signal import savgol_filter
import matplotlib.dates as mdates

# 配置 matplotlib 中文显示
matplotlib.rcParams['font.sans-serif'] = ['SimHei', 'Microsoft YaHei', 'SimSun', 'Arial Unicode MS']
matplotlib.rcParams['axes.unicode_minus'] = False
matplotlib.rcParams['font.family'] = 'sans-serif'

# 全局缓存变量及特征名称 
cached_model = None
last_training_time = None
feature_columns = None
current_view = {'xlim': None, 'ylim': None}  # 用于存储当前图表视图
prediction_mode = "青龙港-陈行"  # 默认预测模式
current_df = None  # 当前使用的数据集


# 定义改进的盐度数据异常过滤方法
def filter_salinity_anomalies(df, threshold_ratio=0.5, window_size=5, max_days=1):
    # 复制数据，避免修改原始数据
    filtered_df = df.copy()
    
    # 确保能访问到日期信息（由于日期已设置为索引）
    values = filtered_df['Value'].values
    dates = filtered_df.index.values  # 从索引获取日期
    
    # 1. 首先处理单个异常点
    i = 1
    while i < len(values):
        # 检查当前值是否小于前一个值的threshold_ratio
        if values[i] < values[i-1] * threshold_ratio:
            baseline = values[i-1]  # 基准值为上一个正常的盐度值
            anomaly_start = i
            j = i
            
            # 向后查找，直到找到一个不小于基准值threshold_ratio的点
            # 或者直到时间区间超过max_days天
            anomaly_start_date = dates[anomaly_start]
            max_date = anomaly_start_date + np.timedelta64(int(max_days*24), 'h')
            
            while j < len(values) and values[j] < baseline * threshold_ratio and dates[j] <= max_date:
                j += 1
            
            anomaly_end = j - 1  # 异常区间的结束位置
            
            # 处理异常区间
            if anomaly_end - anomaly_start < 3:  # 短区间用线性插值
                if j < len(values):
                    # 如果异常区间后还有数据点，使用线性插值
                    for k in range(anomaly_start, anomaly_end + 1):
                        # 线性插值：在基准值和异常区间后第一个正常值之间进行平滑过渡
                        ratio = (k - anomaly_start + 1) / (anomaly_end - anomaly_start + 2)
                        values[k] = baseline * (1 - ratio) + values[j] * ratio
                        # 确保平滑后的值不低于基准的threshold_ratio
                        values[k] = max(values[k], baseline * threshold_ratio)
                else:
                    # 如果异常区间到数据末尾，使用基准值的threshold_ratio填充
                    for k in range(anomaly_start, anomaly_end + 1):
                        values[k] = baseline * threshold_ratio
            else:  # 长区间使用更简单的平滑方式，避免插值错误
                # 使用线性插值来避免非有限值问题
                if j < len(values):
                    end_val = values[j]
                    # 为每个点创建线性插值
                    for k in range(anomaly_start, anomaly_end + 1):
                        fraction = (k - anomaly_start) / (j - anomaly_start) if j > anomaly_start else 0
                        interpolated = baseline * (1 - fraction) + end_val * fraction
                        values[k] = max(interpolated, baseline * threshold_ratio)
                else:
                    # 如果异常区间到数据末尾，使用基准值的threshold_ratio填充
                    for k in range(anomaly_start, anomaly_end + 1):
                        values[k] = baseline * threshold_ratio
            
            i = j  # 跳过已处理的异常区间
        else:
            i += 1
    
    # 2. 应用Savitzky-Golay滤波进行整体平滑
    if len(values) > window_size:
        # 确保window_size是奇数
        if window_size % 2 == 0:
            window_size += 1
        
        # 应用Savitzky-Golay滤波
        try:
            # 对数据进行平滑，但保留原始的特性
            smoothed = savgol_filter(values, window_size, 3)
            
            # 确保平滑后的数据不会小于相邻点的threshold_ratio
            for i in range(1, len(smoothed)):
                smoothed[i] = max(smoothed[i], smoothed[i-1] * threshold_ratio)
            
            values = smoothed
        except Exception as e:
            print(f"Savitzky-Golay滤波应用失败: {e}")
    
    filtered_df['Value'] = values
    return filtered_df


# 数据加载与预处理函数
# -------------------------------
def load_data(upstream_file, downstream_file, river_level_file=None, flow_file=None, source_name="青龙港"):
    """
    加载所有相关数据并进行数据质量处理
    """
    try:
        # 读取上游和下游数据
        upstream_df = pd.read_csv(upstream_file)
        downstream_df = pd.read_csv(downstream_file)
    except FileNotFoundError:
        print("文件未找到，请检查路径")
        return None

    # 确保列名一致
    upstream_df.columns = ['DateTime', 'TagName', 'Value']
    downstream_df.columns = ['DateTime', 'TagName', 'Value']

    # 转换时间格式并设置为索引
    upstream_df['DateTime'] = pd.to_datetime(upstream_df['DateTime'])
    downstream_df['DateTime'] = pd.to_datetime(downstream_df['DateTime'])

    # 设置DateTime为索引
    upstream_df.set_index('DateTime', inplace=True)
    downstream_df.set_index('DateTime', inplace=True)

    # 应用盐度数据异常过滤方法s
    upstream_df = filter_salinity_anomalies(upstream_df, threshold_ratio=0.5, window_size=7, max_days=1)
    downstream_df = filter_salinity_anomalies(downstream_df, threshold_ratio=0.5, window_size=7, max_days=1)
    
    # 处理低盐度值（小于5）
    # 不直接过滤，而是标记为NaN并使用插值方法处理
    for df in [upstream_df, downstream_df]:
        # 标记低盐度值为NaN
        low_salinity_mask = df['Value'] < 5
        if low_salinity_mask.any():
            print(f"发现{low_salinity_mask.sum()}个低盐度值（<5），将使用插值处理")
            df.loc[low_salinity_mask, 'Value'] = np.nan
            
            # 对短期缺失使用线性插值
            df['Value'] = df['Value'].interpolate(method='linear', limit=4)
            
            # 对较长期缺失使用基于时间的插值
            df['Value'] = df['Value'].interpolate(method='time', limit=24)
            
            # 对剩余缺失使用前向和后向填充
            df['Value'] = df['Value'].fillna(method='ffill').fillna(method='bfill')
            
            # 使用更小的窗口进行平滑处理
            df['Value'] = df['Value'].rolling(window=6, center=True, min_periods=1).median()
 

    # 重命名Value列
    upstream_df = upstream_df.rename(columns={'Value': 'upstream'})[['upstream']]
    downstream_df = downstream_df.rename(columns={'Value': 'downstream'})[['downstream']]

    # 合并数据
    merged_df = pd.merge(upstream_df, downstream_df, left_index=True, right_index=True, how='inner')
    
    # 记录数据源名称
    merged_df['source_name'] = source_name

    # 加载长江水位数据
    if river_level_file:
        try:
            river_level_df = pd.read_csv(river_level_file)
            print(f"成功读取水位数据文件: {river_level_file}")
            
            # 确保列名一致
            if len(river_level_df.columns) >= 3:
                river_level_df.columns = ['DateTime', 'TagName', 'Value']
            elif len(river_level_df.columns) == 2:
                river_level_df.columns = ['DateTime', 'Value']
                river_level_df['TagName'] = 'water_level'
            
            # 数据处理
            river_level_df['DateTime'] = pd.to_datetime(river_level_df['DateTime'])
            river_level_df.set_index('DateTime', inplace=True)
            river_level_df['Value'] = pd.to_numeric(river_level_df['Value'], errors='coerce')
            
            # 使用IQR方法处理异常值
            Q1 = river_level_df['Value'].quantile(0.25)
            Q3 = river_level_df['Value'].quantile(0.75)
            IQR = Q3 - Q1
            lower_bound = Q1 - 1.5 * IQR
            upper_bound = Q3 + 1.5 * IQR
            river_level_df.loc[river_level_df['Value'] < lower_bound, 'Value'] = lower_bound
            river_level_df.loc[river_level_df['Value'] > upper_bound, 'Value'] = upper_bound
            
            # 重命名并保留需要的列
            river_level_df = river_level_df.rename(columns={'Value': 'water_level'})[['water_level']]
            
            # 合并到主数据框
            merged_df = pd.merge(merged_df, river_level_df, left_index=True, right_index=True, how='left')
            
            # 对水位数据进行插值处理
            merged_df['water_level'] = merged_df['water_level'].interpolate(method='time', limit=24)
            merged_df['water_level'] = merged_df['water_level'].fillna(method='ffill').fillna(method='bfill')
            
            # 创建平滑的水位数据
            merged_df['water_level_smooth'] = merged_df['water_level'].rolling(window=24, min_periods=1, center=True).mean()
            
            # 添加水位趋势特征
            merged_df['water_level_trend_1h'] = merged_df['water_level_smooth'].diff(1)
            merged_df['water_level_trend_24h'] = merged_df['water_level_smooth'].diff(24)
            
            print(f"水位数据加载成功，范围: {merged_df['water_level'].min()} - {merged_df['water_level'].max()}")
        except Exception as e:
            print(f"水位数据加载失败: {str(e)}")

    # 加载大通流量数据
    if flow_file:
        try:
            flow_df = pd.read_csv(flow_file)
            print(f"成功读取流量数据文件: {flow_file}")
            
            # 确保列名一致
            if len(flow_df.columns) >= 3:
                flow_df.columns = ['DateTime', 'TagName', 'Value']
            elif len(flow_df.columns) == 2:
                flow_df.columns = ['DateTime', 'Value']
                flow_df['TagName'] = 'flow'
            
            # 数据处理
            flow_df['DateTime'] = pd.to_datetime(flow_df['DateTime'])
            flow_df.set_index('DateTime', inplace=True)
            flow_df['Value'] = pd.to_numeric(flow_df['Value'], errors='coerce')
            
            # 使用IQR方法处理异常值
            Q1 = flow_df['Value'].quantile(0.25)
            Q3 = flow_df['Value'].quantile(0.75)
            IQR = Q3 - Q1
            lower_bound = Q1 - 1.5 * IQR
            upper_bound = Q3 + 1.5 * IQR
            flow_df.loc[flow_df['Value'] < lower_bound, 'Value'] = lower_bound
            flow_df.loc[flow_df['Value'] > upper_bound, 'Value'] = upper_bound
            
            # 重命名并保留需要的列
            flow_df = flow_df.rename(columns={'Value': 'flow'})[['flow']]
            
            # 合并到主数据框
            merged_df = pd.merge(merged_df, flow_df, left_index=True, right_index=True, how='left')
            
            # 对流量数据进行插值处理
            merged_df['flow'] = merged_df['flow'].interpolate(method='time', limit=24)
            merged_df['flow'] = merged_df['flow'].fillna(method='ffill').fillna(method='bfill')
            
            # 创建平滑的流量数据
            merged_df['flow_smooth'] = merged_df['flow'].rolling(window=24, min_periods=1, center=True).mean()
            
            # 添加流量趋势特征
            merged_df['flow_trend_1h'] = merged_df['flow_smooth'].diff(1)
            merged_df['flow_trend_24h'] = merged_df['flow_smooth'].diff(24)
            
            # 添加流量统计特征
            merged_df['mean_1d_flow'] = merged_df['flow_smooth'].rolling(window=24, min_periods=1).mean()
            merged_df['mean_3d_flow'] = merged_df['flow_smooth'].rolling(window=72, min_periods=1).mean()
            merged_df['std_1d_flow'] = merged_df['flow_smooth'].rolling(window=24, min_periods=1).std()
            
            # 添加流量变化特征
            merged_df['flow_change_1h'] = merged_df['flow_smooth'].diff(1)
            merged_df['flow_change_24h'] = merged_df['flow_smooth'].diff(24)
            
            print(f"流量数据加载成功，范围: {merged_df['flow'].min()} - {merged_df['flow'].max()} m³/s")
        except Exception as e:
            print(f"流量数据加载失败: {str(e)}")

    # 对盐度数据进行插值和平滑处理
    merged_df['upstream'] = merged_df['upstream'].interpolate(method='time', limit=24)
    merged_df['downstream'] = merged_df['downstream'].interpolate(method='time', limit=24)

    # 使用前向后向填充处理剩余的NaN值
    merged_df['upstream'] = merged_df['upstream'].ffill().bfill()
    merged_df['downstream'] = merged_df['downstream'].ffill().bfill()

    # 创建平滑的盐度数据
    merged_df['upstream_smooth'] = merged_df['upstream'].rolling(window=24, min_periods=1, center=True).mean()
    merged_df['downstream_smooth'] = merged_df['downstream'].rolling(window=24, min_periods=1, center=True).mean()

    # 添加趋势特征
    merged_df['upstream_trend_1h'] = merged_df['upstream_smooth'].diff(1)
    merged_df['upstream_trend_24h'] = merged_df['upstream_smooth'].diff(24)
    merged_df['downstream_trend_1h'] = merged_df['downstream_smooth'].diff(1)
    merged_df['downstream_trend_24h'] = merged_df['downstream_smooth'].diff(24)

    # 填充NaN值
    merged_df['upstream_trend_1h'] = merged_df['upstream_trend_1h'].fillna(0)
    merged_df['upstream_trend_24h'] = merged_df['upstream_trend_24h'].fillna(0)
    merged_df['downstream_trend_1h'] = merged_df['downstream_trend_1h'].fillna(0)
    merged_df['downstream_trend_24h'] = merged_df['downstream_trend_24h'].fillna(0)

    # 对低盐度部分使用更大的窗口进行平滑
    low_sal_mask = merged_df['upstream'] < 50
    if low_sal_mask.any():
        merged_df.loc[low_sal_mask, 'upstream_smooth'] = merged_df.loc[low_sal_mask, 'upstream']\
            .rolling(window=48, min_periods=1, center=True).mean()

    # 数据验证和统计
    print("\n数据质量统计:")
    print(f"总数据量: {len(merged_df)}")
    print(f"上游({source_name})盐度范围: {merged_df['upstream_smooth'].min():.2f} - {merged_df['upstream_smooth'].max():.2f}")
    print(f"下游(陈行)盐度范围: {merged_df['downstream_smooth'].min():.2f} - {merged_df['downstream_smooth'].max():.2f}")
    
    if 'water_level' in merged_df.columns:
        print(f"水位范围: {merged_df['water_level_smooth'].min():.2f} - {merged_df['water_level_smooth'].max():.2f}")
        print(f"水位缺失比例: {merged_df['water_level_smooth'].isna().mean()*100:.2f}%")
    
    if 'flow' in merged_df.columns:
        print(f"流量范围: {merged_df['flow_smooth'].min():.2f} - {merged_df['flow_smooth'].max():.2f} m³/s")
        print(f"流量缺失比例: {merged_df['flow_smooth'].isna().mean()*100:.2f}%")

    # 重置索引，将DateTime作为列
    merged_df = merged_df.reset_index()

    return merged_df

def resample_to_hourly(df):
    """
    将分钟级数据重采样为小时级数据，计算每小时的平均值
    """
    try:
        # 确保DateTime是索引
        if 'DateTime' in df.columns:
            df = df.set_index('DateTime')
        
        # 获取所有数值列
        numeric_columns = df.select_dtypes(include=[np.number]).columns.tolist()
        
        # 按小时重采样，计算平均值
        hourly_df = df[numeric_columns].resample('H').mean()
        
        # 重置索引，将DateTime作为列
        hourly_df = hourly_df.reset_index()
        
        print(f"数据已从分钟级重采样为小时级，原始数据行数: {len(df)}，重采样后行数: {len(hourly_df)}")
        return hourly_df
        
    except Exception as e:
        print(f"重采样数据异常: {e}")
        return df


# # 测试
# df = load_data('yuce_data/青龙港1.csv', 'yuce_data/一取水.csv')
#  # 将数据重采样为小时级
# df = resample_to_hourly(df)
# df.to_csv('merged_data_hour.csv', index=False)
# print(f"Merged data saved to 'merged_data_hour.csv' successfully")

# # 绘制盐度随时间变化图
# plt.figure(figsize=(12, 6))
# plt.plot(df['DateTime'], df['upstream_smooth'], label='上游盐度', color='blue')
# plt.plot(df['DateTime'], df['downstream_smooth'], label='下游盐度', color='red')
# plt.axhline(y=250, color='red', linestyle='--', alpha=0.7, linewidth=1.5, label='盐度警戒线 (250)')
# plt.xlabel('时间')
# plt.ylabel('盐度')
# plt.title('盐度随时间变化图')
# plt.legend()
# plt.grid(True)
# plt.tight_layout()
# plt.savefig('salinity_time_series.png', dpi=300)
# plt.show()

# -------------------------------
# 添加农历（潮汐）特征
# -------------------------------
def add_lunar_features(df):
    lunar_day, lunar_phase_sin, lunar_phase_cos, is_high_tide = [], [], [], []
    # 新增潮汐权重特征
    tide_weight, tide_period_type = [], []
    
    for dt in df['DateTime']:
        ld = LunarDate.fromSolarDate(dt.year, dt.month, dt.day)
        lunar_day.append(ld.day)
        
        # 基础潮汐周期特征（正弦和余弦变换）
        lunar_phase_sin.append(np.sin(2 * np.pi * ld.day / 15))
        lunar_phase_cos.append(np.cos(2 * np.pi * ld.day / 15))
        
        # 大潮期（农历初一至初五及十六至二十）标记为1，其他为0
        is_high = 1 if (ld.day <= 5 or (ld.day >= 16 and ld.day <= 20)) else 0
        is_high_tide.append(is_high)
        
        # 增加潮汐周期类型
        # 1: 大活汛期（农历初一至初五及十六至二十）
        # 2: 死汛期前段（农历初六至十五）
        # 3: 死汛期后段（农历二十一至月末）
        if ld.day <= 5 or (ld.day >= 16 and ld.day <= 20):
            period_type = 1  # 大活汛期
        elif ld.day >= 6 and ld.day <= 15:
            period_type = 2  # 死汛期前段
        else:
            period_type = 3  # 死汛期后段
        tide_period_type.append(period_type)
        
        # 根据潮汐周期类型和具体农历日分配权重
        # 大活汛期权重较高，死汛期权重较低
        if period_type == 1:  # 大活汛期
            # 初一/十六权重最高，递减至初五/二十
            if ld.day <= 5:
                weight = 1.0 - (ld.day - 1) * 0.1  # 1.0, 0.9, 0.8, 0.7, 0.6
            else:
                weight = 1.0 - (ld.day - 16) * 0.1  # 1.0, 0.9, 0.8, 0.7, 0.6
        elif period_type == 2:  # 死汛期前段
            # 权重稍低且平缓变化
            weight = 0.5 - (ld.day - 6) * 0.02  # 从0.5慢慢降到0.3
        else:  # 死汛期后段
            # 权重稍低且平缓变化
            weight = 0.5 - (ld.day - 21) * 0.02  # 从0.5慢慢降到0.3
            
        tide_weight.append(weight)
    
    # 添加原有特征
    df['lunar_day'] = lunar_day
    df['lunar_phase_sin'] = lunar_phase_sin
    df['lunar_phase_cos'] = lunar_phase_cos
    df['is_high_tide'] = is_high_tide
    
    # 添加新的潮汐特征
    df['tide_weight'] = tide_weight
    df['tide_period_type'] = tide_period_type
    
    # 添加潮汐45分钟延迟特征
    # 每天潮汐时间后延约45分钟，对应角度变化约11.25度（360度/24小时*0.75小时）
    hour_values = df['DateTime'].dt.hour + df['DateTime'].dt.minute / 60.0
    
    # 计算潮汐时间延迟的周期性特征
    # 将lunar_day转换为numpy数组进行计算
    lunar_day_array = np.array(lunar_day)
    
    # 基于农历日与时间的组合，表示具体某天某时的潮汐状态
    df['tide_time_sin'] = np.sin(2 * np.pi * (hour_values / 12 + lunar_day_array * 0.75 / 12))
    df['tide_time_cos'] = np.cos(2 * np.pi * (hour_values / 12 + lunar_day_array * 0.75 / 12))
    
    # 添加潮汐强度与盐度相关性的特征
    # 组合潮汐权重与时间特征
    df['tide_salt_factor'] = df['tide_weight'] * (1 + 0.5 * np.sin(2 * np.pi * hour_values / 12))
    
    return df


# -------------------------------
# 生成延迟特征（向量化）
# -------------------------------
def batch_create_delay_features(df, delay_hours):
    """
    为数据框中的特定列创建延迟特征
    """
    # 定义需要创建延迟特征的列
    target_columns = ['upstream_smooth']
    

 # target_columns = ['upstream_smooth', 'downstream_smooth']
    # # 如果存在水位数据列，也为它创建延迟特征  暂时不使用
    # if 'water_level_smooth' in df.columns:
    #     target_columns.append('water_level_smooth')
    # elif 'water_level' in df.columns:
    #     print("注意: 水位平滑列不存在，使用原始水位列创建延迟特征")
    #     # 创建水位平滑列
    #     df['water_level_smooth'] = df['water_level'].rolling(window=24, min_periods=1, center=True).mean()
    #     df['water_level_smooth'] = df['water_level_smooth'].fillna(df['water_level'])
    #     target_columns.append('water_level_smooth')

    # 创建延迟特征
    for column in target_columns:
        if column in df.columns:
            for delay in delay_hours:
                df[f'{column.split("_")[0]}_delay_{delay}h'] = df[column].shift(delay)
        else:
            print(f"警告: 列 {column} 不存在，跳过创建延迟特征")
            
    return df


# 生成其他特征
def generate_features(df):
    """
    生成其他特征，包括历史数据、时间特征、统计特征和外部特征，并将这些特征添加到原始DataFrame中
    """
    try:
        # 创建平滑的盐度数据
        df['upstream_smooth'] = df['upstream'].rolling(window=24, min_periods=1, center=True).mean()
        df['downstream_smooth'] = df['downstream'].rolling(window=24, min_periods=1, center=True).mean()
        
        # 时间特征
        df['hour'] = df['DateTime'].dt.hour
        df['weekday'] = df['DateTime'].dt.dayofweek
        df['month'] = df['DateTime'].dt.month
        
        # 时间特征的sin和cos转换
        df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)
        df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)
        df['weekday_sin'] = np.sin(2 * np.pi * df['weekday'] / 7)
        df['weekday_cos'] = np.cos(2 * np.pi * df['weekday'] / 7)
        df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
        df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)
        
        # 统计特征
        df['mean_1d_up'] = df['upstream_smooth'].rolling(window=24, min_periods=1).mean()
        df['mean_3d_up'] = df['upstream_smooth'].rolling(window=72, min_periods=1).mean()
        df['std_1d_up'] = df['upstream_smooth'].rolling(window=24, min_periods=1).std()
        
        df['mean_1d_down'] = df['downstream_smooth'].rolling(window=24, min_periods=1).mean()
        df['mean_3d_down'] = df['downstream_smooth'].rolling(window=72, min_periods=1).mean()
        df['std_1d_down'] = df['downstream_smooth'].rolling(window=24, min_periods=1).std()
        
        # 趋势特征
        df['trend_1h_up'] = df['upstream_smooth'].diff(1)
        df['trend_3h_up'] = df['upstream_smooth'].diff(3)
        df['trend_6h_up'] = df['upstream_smooth'].diff(6)
        df['trend_12h_up'] = df['upstream_smooth'].diff(12)
        df['trend_24h_up'] = df['upstream_smooth'].diff(24)
        
        df['trend_1h_down'] = df['downstream_smooth'].diff(1)
        df['trend_3h_down'] = df['downstream_smooth'].diff(3)
        df['trend_6h_down'] = df['downstream_smooth'].diff(6)
        df['trend_12h_down'] = df['downstream_smooth'].diff(12)
        df['trend_24h_down'] = df['downstream_smooth'].diff(24)
        
        # 外部特征（水位和流量）
        if 'water_level_smooth' in df.columns:
            df['water_level_trend_1h'] = df['water_level_smooth'].diff(1)
            df['water_level_trend_24h'] = df['water_level_smooth'].diff(24)
            df['mean_1d_water_level'] = df['water_level_smooth'].rolling(window=24, min_periods=1).mean()
            df['mean_3d_water_level'] = df['water_level_smooth'].rolling(window=72, min_periods=1).mean()
            df['std_1d_water_level'] = df['water_level_smooth'].rolling(window=24, min_periods=1).std()
        
        if 'flow_smooth' in df.columns:
            df['flow_trend_1h'] = df['flow_smooth'].diff(1)
            df['flow_trend_24h'] = df['flow_smooth'].diff(24)
            df['mean_1d_flow'] = df['flow_smooth'].rolling(window=24, min_periods=1).mean()
            df['mean_3d_flow'] = df['flow_smooth'].rolling(window=72, min_periods=1).mean()
            df['std_1d_flow'] = df['flow_smooth'].rolling(window=24, min_periods=1).std()
            
        # 新增：增强短期连续性特征 - 为提高第一天预测准确性
        # 最近几小时的细粒度盐度变化特征
        for h in [1, 2, 3, 4, 6, 8, 12]:
            # 分别计算上游和下游盐度的近期变化率
            df[f'recent_{h}h_rate_down'] = (df['downstream_smooth'] - df['downstream_smooth'].shift(h)) / h
            df[f'recent_{h}h_rate_up'] = (df['upstream_smooth'] - df['upstream_smooth'].shift(h)) / h
        
        # 最近24小时的加权移动平均，赋予最近数据更高权重
        weights_24h = np.linspace(0.1, 1.0, 24)  # 线性递增权重
        df['weighted_24h_down'] = df['downstream_smooth'].rolling(window=24).apply(
            lambda x: np.sum(x * weights_24h[:len(x)]) / np.sum(weights_24h[:len(x)]), raw=True
        )
        df['weighted_24h_up'] = df['upstream_smooth'].rolling(window=24).apply(
            lambda x: np.sum(x * weights_24h[:len(x)]) / np.sum(weights_24h[:len(x)]), raw=True
        )
        
        # 最近数小时的指数平滑特征
        alpha = 0.3  # 平滑因子
        df['exp_smooth_down'] = df['downstream_smooth'].ewm(alpha=alpha, adjust=False).mean()
        df['exp_smooth_up'] = df['upstream_smooth'].ewm(alpha=alpha, adjust=False).mean()
        
        # 最近变化趋势的稳定性/波动性特征
        df['trend_stability_12h_down'] = df['trend_1h_down'].rolling(window=12).std() / df['downstream_smooth'].rolling(window=12).mean()
        df['trend_stability_12h_up'] = df['trend_1h_up'].rolling(window=12).std() / df['upstream_smooth'].rolling(window=12).mean()
        
        # 计算短期趋势的加速度（变化率的变化率）
        df['trend_acceleration_down'] = df['trend_1h_down'].diff(1)
        df['trend_acceleration_up'] = df['trend_1h_up'].diff(1)
        
        return df
        
    except Exception as e:
        print(f"特征生成异常: {e}")
        return df


# -------------------------------
# 主程序入口：加载数据、添加特征、生成延迟特征后启动GUI
# -------------------------------
def save_processed_data(df, filename='processed_data.pkl'):
    try:
        df.to_pickle(filename)
        print(f"已保存处理后的数据到 {filename}")
        return True
    except Exception as e:
        print(f"保存数据失败: {e}")
        return False

def load_processed_data(filename='processed_data.pkl'):
    try:
        if os.path.exists(filename):
            df = pd.read_pickle(filename)
            print(f"已从 {filename} 加载处理后的数据")
            return df
        else:
            print(f"找不到处理后的数据文件 {filename}")
            return None
    except Exception as e:
        print(f"加载数据失败: {e}")
        return None

def load_both_datasets():
    """加载两个数据源的数据集"""
    global cached_model, last_training_time
    
    # 删除旧的处理数据和模型文件，以应用修复后的代码
    for file in ['processed_data_qinglong.pkl', 'processed_data_taicang.pkl', 
                'salinity_model_qinglong.pkl', 'salinity_model_taicang.pkl']:
        if os.path.exists(file):
            try:
                os.remove(file)
                print(f"已删除旧的文件: {file}")
            except Exception as e:
                print(f"删除文件失败: {file} - {e}")
    
    # 加载青龙港-陈行数据集
    qinglong_df = load_processed_data('processed_data_qinglong.pkl')
    if qinglong_df is None:
        # 创建新的数据集
        print("正在处理青龙港-陈行数据集...")
        qinglong_df = load_data('yuce_data\青龙港盐度1.csv', 'yuce_data\陈行第一取水口盐度.csv', 
                              'yuce_data\长江液位.csv', 'yuce_data\大通流量.csv', source_name="青龙港")
       
        # 将数据重采样为小时级
        qinglong_df = resample_to_hourly(qinglong_df)
       
        if qinglong_df is not None:
            # 添加时间特征
            qinglong_df['hour'] = qinglong_df['DateTime'].dt.hour
            qinglong_df['weekday'] = qinglong_df['DateTime'].dt.dayofweek
            qinglong_df['month'] = qinglong_df['DateTime'].dt.month
            
            # 添加农历特征
            qinglong_df = add_lunar_features(qinglong_df)
            
            # 添加延迟特征 (青龙港-陈行: 3-7天)
            delay_hours = [36,39,42,45,48,51,54,57,60,72,78,84,90,96,102,108,114,120,126,132,138,144,150,156,162,168]
            qinglong_df = batch_create_delay_features(qinglong_df, delay_hours)
            
            # 添加统计特征
            qinglong_df = generate_features(qinglong_df)
            
            
            # 保存处理后的数据
            save_processed_data(qinglong_df, 'processed_data_qinglong.pkl')
            print("青龙港-陈行数据集处理完成")
    else:
        print("已从缓存加载青龙港-陈行数据集")
    
    # 加载太仓石化-陈行数据集
    taicang_df = load_processed_data('processed_data_taicang.pkl')
    if taicang_df is None:
        # 创建新的数据集
        print("正在处理太仓石化-陈行数据集...")
        taicang_df = load_data('yuce_data\太仓石化盐度2.csv', 'yuce_data\陈行第一取水口盐度.csv', 
                             'yuce_data\长江液位.csv', 'yuce_data\大通流量.csv', source_name="太仓石化")
        
        # 将数据重采样为小时级
        taicang_df = resample_to_hourly(taicang_df)

        if taicang_df is not None:
            # 添加时间特征
            taicang_df['hour'] = taicang_df['DateTime'].dt.hour
            taicang_df['weekday'] = taicang_df['DateTime'].dt.dayofweek
            taicang_df['month'] = taicang_df['DateTime'].dt.month
            
            # 添加农历特征
            taicang_df = add_lunar_features(taicang_df)
            
            # 添加延迟特征 (太仓石化-陈行: 1-3天)
            delay_hours = [1,2,3,4,5,6,8,10,12,14,16,18,24,30,36,42,48,54,60,66,72]
            taicang_df = batch_create_delay_features(taicang_df, delay_hours)
            
            # 添加统计特征
            taicang_df = generate_features(taicang_df)
            
            
            # 保存处理后的数据
            save_processed_data(taicang_df, 'processed_data_taicang.pkl')
            print("太仓石化-陈行数据集处理完成")
    else:
        print("已从缓存加载太仓石化-陈行数据集")
    
    return qinglong_df, taicang_df


# -------------------------------
# 模型训练与预测，展示验证准确度（RMSE, MAE）
# -------------------------------
def train_and_predict(df, start_time, force_retrain=False):
    global cached_model, last_training_time, prediction_mode
    
    # 根据当前预测模式选择模型缓存文件
    if prediction_mode == "青龙港-陈行":
        model_cache_file = 'salinity_model_qinglong.pkl'
        scaler_cache_file = 'scaler_qinglong.pkl'
    else:  # 太仓石化-陈行
        model_cache_file = 'salinity_model_taicang.pkl'
        scaler_cache_file = 'scaler_taicang.pkl'
        
    model_needs_training = True
    scaler = None  # 初始化特征缩放器
    model_data = None  # 初始化model_data变量

    if os.path.exists(model_cache_file) and force_retrain:
        try:
            os.remove(model_cache_file)
            if os.path.exists(scaler_cache_file):
                os.remove(scaler_cache_file)
            print("已删除旧模型缓存（强制重新训练）")
        except Exception as e:
            print("删除缓存异常:", e)

    # 准备训练数据
    train_df = df[df['DateTime'] < start_time].copy()
    test_X, test_y = create_features_vectorized(train_df, look_back=7, forecast_horizon=1)
    if test_X is None or test_y is None:
        print("特征生成失败")
        return None, None, None, None
        
    current_feature_dim = test_X.shape[1] if len(test_X) > 0 else 0
    print(f"当前特征维度: {current_feature_dim}")
    
    # 检查缓存模型
    cached_feature_dim = None
    if not force_retrain and cached_model is not None and last_training_time is not None:
        if last_training_time >= train_df['DateTime'].max():
            try:
                cached_feature_dim = cached_model['xgb'].n_features_in_
                print(f"缓存模型特征维度: {cached_feature_dim}")
                
                if cached_feature_dim == current_feature_dim:
                    model_needs_training = False
                    print(f"使用缓存模型，训练时间: {last_training_time}")
                    # 加载特征缩放器
                    if os.path.exists(scaler_cache_file):
                        with open(scaler_cache_file, 'rb') as f:
                            scaler = pickle.load(f)
                            print("从缓存加载特征缩放器")
                else:
                    print(f"特征维度不匹配（缓存模型: {cached_feature_dim}，当前: {current_feature_dim}），需要重新训练")
            except Exception as e:
                print(f"检查模型特征维度失败: {e}")
    elif not force_retrain and os.path.exists(model_cache_file):
        try:
            with open(model_cache_file, 'rb') as f:
                model_data = pickle.load(f)
                cached_model = model_data['models']
                last_training_time = model_data['training_time']
                
                try:
                    cached_feature_dim = cached_model['xgb'].n_features_in_
                    print(f"文件缓存模型特征维度: {cached_feature_dim}")
                    if cached_feature_dim == current_feature_dim:
                        if last_training_time >= train_df['DateTime'].max():
                            model_needs_training = False
                            print(f"从文件加载模型，训练时间: {last_training_time}")
                            # 加载特征缩放器
                            if os.path.exists(scaler_cache_file):
                                with open(scaler_cache_file, 'rb') as f:
                                    scaler = pickle.load(f)
                                    print("从文件加载特征缩放器")
                        else:
                            print(f"训练时间不足，需要重新训练")
                            model_needs_training = True
                    else:
                        print(f"特征维度不匹配（文件模型: {cached_feature_dim}，当前: {current_feature_dim}），需要重新训练")
                except Exception as e:
                    print(f"检查模型特征维度失败: {e}")
                    model_needs_training = True
        except Exception as e:
            print("加载模型失败:", e)
            model_needs_training = True

    # 训练新模型
    if model_needs_training:
        print(f"开始训练新{prediction_mode}模型...")
        if len(train_df) < 100:
            print("训练数据不足")
            return None, None, None, None

        start_train = time()
        X, y = create_features_vectorized(train_df, look_back=7, forecast_horizon=1)
        if X is None or y is None or len(X) == 0 or len(y) == 0:
            print("特征生成失败或样本不足")
            return None, None, None, None
            
        print(f"训练样本数量: {X.shape[0]}, 特征维度: {X.shape[1]}")
        
        # 为特征生成名称（用于分析）
        feature_names = generate_feature_names(train_df, X.shape[1])
        
        # 特征归一化处理
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X)
        print("特征已归一化处理")
        
        # 保存特征缩放器
        with open(scaler_cache_file, 'wb') as f:
            pickle.dump(scaler, f)
            print(f"特征缩放器已保存至 {scaler_cache_file}")
        
        # 分割训练集和验证集
        X_train, X_val, y_train, y_val = train_test_split(X_scaled, y, test_size=0.15, random_state=42)
        print(f"训练集: {X_train.shape[0]}样本, 验证集: {X_val.shape[0]}样本")
        
        try:
            # 训练多个模型
            models = {}
            predictions = {}
            metrics = {}
            
            # 1. 训练标准XGBoost回归模型
            print("训练XGBoost模型...")
            xgb_model = XGBRegressor(
                n_estimators=300,
                learning_rate=0.05,
                max_depth=6,
                min_child_weight=2,
                subsample=0.8,
                colsample_bytree=0.8,
                gamma=0.1,
                reg_alpha=0.1,
                reg_lambda=1.0,
                n_jobs=-1,
                random_state=42,
                early_stopping_rounds=20
            )
            xgb_model.fit(X_train, y_train, eval_set=[(X_val, y_val)], eval_metric='rmse', verbose=False)
            models['xgb'] = xgb_model
            
            # 在验证集上评估
            xgb_preds = xgb_model.predict(X_val)
            predictions['xgb'] = xgb_preds
            xgb_rmse = np.sqrt(mean_squared_error(y_val, xgb_preds))
            xgb_mae = mean_absolute_error(y_val, xgb_preds)
            metrics['xgb'] = {'rmse': xgb_rmse, 'mae': xgb_mae}
            print(f"XGBoost - RMSE: {xgb_rmse:.4f}, MAE: {xgb_mae:.4f}")
            
            # 2. 训练XGBoost随机森林版本模型
            print("训练XGBoost随机森林模型...")
            xgbrf_model = XGBRFRegressor(
                n_estimators=300,
                max_depth=7,
                subsample=0.8,
                colsample_bytree=0.8,
                reg_alpha=0.05,
                reg_lambda=1.0,
                n_jobs=-1,
                random_state=42
            )
            xgbrf_model.fit(X_train, y_train)
            models['xgbrf'] = xgbrf_model
            
            # 在验证集上评估
            xgbrf_preds = xgbrf_model.predict(X_val)
            predictions['xgbrf'] = xgbrf_preds
            xgbrf_rmse = np.sqrt(mean_squared_error(y_val, xgbrf_preds))
            xgbrf_mae = mean_absolute_error(y_val, xgbrf_preds)
            metrics['xgbrf'] = {'rmse': xgbrf_rmse, 'mae': xgbrf_mae}
            print(f"XGBoost RF - RMSE: {xgbrf_rmse:.4f}, MAE: {xgbrf_mae:.4f}")
            
            # 3. 训练随机森林模型
            print("训练随机森林模型...")
            rf_model = RandomForestRegressor(
                n_estimators=200,
                max_depth=10,
                min_samples_split=5,
                min_samples_leaf=2,
                max_features='sqrt',
                n_jobs=-1,
                random_state=42
            )
            rf_model.fit(X_train, y_train)
            models['rf'] = rf_model
            
            # 在验证集上评估
            rf_preds = rf_model.predict(X_val)
            predictions['rf'] = rf_preds
            rf_rmse = np.sqrt(mean_squared_error(y_val, rf_preds))
            rf_mae = mean_absolute_error(y_val, rf_preds)
            metrics['rf'] = {'rmse': rf_rmse, 'mae': rf_mae}
            print(f"随机森林 - RMSE: {rf_rmse:.4f}, MAE: {rf_mae:.4f}")
            
            # 计算加权融合权重（基于各模型的RMSE取倒数）
            total_weight = 1/xgb_rmse + 1/xgbrf_rmse + 1/rf_rmse
            weights = {
                'xgb': 1/xgb_rmse / total_weight,
                'xgbrf': 1/xgbrf_rmse / total_weight,
                'rf': 1/rf_rmse / total_weight
            }
            print(f"模型权重: XGBoost={weights['xgb']:.2f}, XGBRF={weights['xgbrf']:.2f}, RF={weights['rf']:.2f}")
            
            # 融合预测
            ensemble_preds = weights['xgb'] * xgb_preds + weights['xgbrf'] * xgbrf_preds + weights['rf'] * rf_preds
            ensemble_rmse = np.sqrt(mean_squared_error(y_val, ensemble_preds))
            ensemble_mae = mean_absolute_error(y_val, ensemble_preds)
            print(f"融合模型 - RMSE: {ensemble_rmse:.4f}, MAE: {ensemble_mae:.4f}")
            
            # 分析特征重要性
            feature_importance = xgb_model.feature_importances_
            sorted_idx = np.argsort(feature_importance)[::-1]
            print(f"\n{prediction_mode}模型 前15重要特征:")
            for i in range(min(15, len(sorted_idx))):
                print(f"{i+1}. {feature_names[sorted_idx[i]]}: {feature_importance[sorted_idx[i]]:.6f}")
            
            # 保存所有模型和权重
            cached_model = models
            last_training_time = start_time
            
            model_data = {
                'models': models,
                'weights': weights,
                'training_time': last_training_time,
                'feature_columns': feature_names,
                'metrics': {
                    'xgb_rmse': xgb_rmse,
                    'xgb_mae': xgb_mae,
                    'ensemble_rmse': ensemble_rmse,
                    'ensemble_mae': ensemble_mae
                },
                'feature_dim': current_feature_dim
            }
            
            with open(model_cache_file, 'wb') as f:
                pickle.dump(model_data, f)
            
            print(f"{prediction_mode}模型训练完成，耗时: {time() - start_train:.2f}秒")
            
        except Exception as e:
            print(f"模型训练异常: {e}")
            import traceback
            traceback.print_exc()
            return None, None, None, None
    else:
        models = cached_model
        # 加载权重和指标
        with open(model_cache_file, 'rb') as f:
            model_data = pickle.load(f)
            weights = model_data.get('weights', {'xgb': 0.6, 'xgbrf': 0.25, 'rf': 0.15})

    # 预测部分
    try:
        if scaler is None and os.path.exists(scaler_cache_file):
            with open(scaler_cache_file, 'rb') as f:
                scaler = pickle.load(f)
                print("预测前加载特征缩放器")
        
        # 初始化预测结果
        future_dates = [start_time + timedelta(days=i) for i in range(5)]
        predictions = np.zeros(5)
        
        # 创建预测特征
        X_pred = []
        for i in range(5):
            current_date = future_dates[i]
            features = generate_prediction_features(df, current_date, look_back=7)
            if features is None:
                print(f"生成预测特征失败: {current_date}")
                return None, None, None, None
            X_pred.append(features)
        
        # 特征归一化
        X_pred = np.array(X_pred)
        if scaler is not None:
            X_pred = scaler.transform(X_pred)
            print("预测特征已归一化")
        
        # 使用多模型预测
        model_predictions = {}
        for model_name, model in models.items():
            model_predictions[model_name] = model.predict(X_pred)
        
        # 模型融合
        ensemble_predictions = np.zeros(len(future_dates))
        for model_name, weight in weights.items():
            ensemble_predictions += weight * model_predictions[model_name]
        
        # 计算预测的置信区间
        train_std = 10.0  # 默认误差估计
        if model_data and 'metrics' in model_data and 'ensemble_rmse' in model_data['metrics']:
            train_std = model_data['metrics']['ensemble_rmse']
        
        prediction_intervals = np.array([
            ensemble_predictions - 1.96 * train_std,
            ensemble_predictions + 1.96 * train_std
        ])
        
        # 打印各模型的预测结果
        print("\n各模型预测结果:")
        for date, xgb_pred, xgbrf_pred, rf_pred, ens_pred in zip(
            future_dates, 
            model_predictions['xgb'], 
            model_predictions['xgbrf'],
            model_predictions['rf'],
            ensemble_predictions
        ):
            print(f"{date.strftime('%Y-%m-%d')} - XGB: {xgb_pred:.2f}, XGBRF: {xgbrf_pred:.2f}, RF: {rf_pred:.2f}, 融合: {ens_pred:.2f}")
        
        return future_dates, ensemble_predictions, models, prediction_intervals
        
    except Exception as e:
        print(f"预测过程异常: {e}")
        import traceback
        traceback.print_exc()
        return None, None, None, None

# 添加特征名称生成函数
def generate_feature_names(df, feature_dim):
    """为特征矩阵生成可读的特征名称"""
    feature_names = []
    
    # 获取所有数值列
    numeric_columns = df.select_dtypes(include=[np.number]).columns.tolist()
    if 'DateTime' in numeric_columns:
        numeric_columns.remove('DateTime')
    
    # 为每个时间步和每个数值列创建特征名称
    for i in range(7):
        for col in numeric_columns:
            feature_names.append(f'{col}_t{i}')
    
    # 添加时间特征名称
    feature_names.extend(['month', 'day', 'weekday'])
    
    # 添加额外的统计特征名称（针对最后24小时的特征）
    feature_names.extend([
        'recent_downstream_mean',
        'recent_downstream_std',
        'recent_downstream_change',
        'recent_downstream_rate'
    ])
    
    # 确保特征名称数量与特征维度匹配
    if len(feature_names) != feature_dim:
        print(f"特征名称数量({len(feature_names)})与特征维度({feature_dim})不匹配")
        # 使用自动生成的特征名称
        feature_names = [f'feature_{i}' for i in range(feature_dim)]
    
    return feature_names

# -------------------------------
# 获取模型准确度指标
# -------------------------------
def get_model_metrics():
    """获取保存在模型缓存中的准确度指标"""
    global prediction_mode
    
    # 根据当前预测模式选择模型缓存文件
    if prediction_mode == "青龙港-陈行":
        model_cache_file = 'salinity_model_qinglong.pkl'
    else:  # 太仓石化-陈行
        model_cache_file = 'salinity_model_taicang.pkl'
        
    if os.path.exists(model_cache_file):
        try:
            with open(model_cache_file, 'rb') as f:
                model_data = pickle.load(f)
                
                # 适配新的数据结构
                if 'metrics' in model_data:
                    # 新版本数据结构
                    return {
                        'rmse': model_data['metrics'].get('ensemble_rmse', 0),
                        'mae': model_data['metrics'].get('ensemble_mae', 0)
                    }
                elif 'rmse' in model_data and 'mae' in model_data:
                    # 旧版本数据结构
                    return {
                        'rmse': model_data.get('rmse', 0),
                        'mae': model_data.get('mae', 0)
                    }
                else:
                    # 无效的数据结构
                    print("无效的模型指标数据结构")
                    return {'rmse': 0, 'mae': 0}
        except Exception as e:
            print(f"获取模型指标失败: {e}")
    
    # 如果无法获取到指标，返回默认值
    return {'rmse': 0, 'mae': 0}

def run_gui():
    """运行GUI界面"""
    global qinglong_df, taicang_df
    def configure_gui_fonts():
        font_names = ['微软雅黑', 'Microsoft YaHei', 'SimSun', 'SimHei']
        for font_name in font_names:
            try:
                default_font = tkfont.nametofont("TkDefaultFont")
                default_font.configure(family=font_name)
                text_font = tkfont.nametofont("TkTextFont")
                text_font.configure(family=font_name)
                fixed_font = tkfont.nametofont("TkFixedFont")
                fixed_font.configure(family=font_name)
                return True
            except Exception as e:
                continue
        return False
        
    def switch_prediction_mode():
        global prediction_mode, current_df, df, cached_model, last_training_time
        
        # 切换预测模式
        if prediction_mode == "青龙港-陈行":
            prediction_mode = "太仓石化-陈行"
            current_df = taicang_df
            switch_button.config(text="切换到青龙港-陈行")
        else:
            prediction_mode = "青龙港-陈行"
            current_df = qinglong_df
            switch_button.config(text="切换到太仓石化-陈行")
            
        # 更新当前数据集
        df = current_df
        
        # 重置模型缓存
        cached_model = None
        last_training_time = None
        
        # 更新标题
        root.title(f"{prediction_mode}盐度预测系统")
        
        # 更新界面信息
        status_label.config(text=f"已切换到{prediction_mode}模式")
        
        # 更新模型指标
        model_metrics = get_model_metrics()
        metrics_text = "模型准确度: 未知" if not model_metrics else f"模型准确度 - RMSE: {model_metrics['rmse']:.4f}, MAE: {model_metrics['mae']:.4f}"
        metrics_label.config(text=metrics_text)
        
        # 显示历史数据
        display_history_data()

    def show_feature_importance():
        """显示特征重要性窗口，显示XGBoost的特征重要性"""
        try:
            # 检查模型缓存文件
            model_cache_file = 'salinity_model_qinglong.pkl' if prediction_mode == "青龙港-陈行" else 'salinity_model_taicang.pkl'
            if not os.path.exists(model_cache_file):
                status_label.config(text="无法获取特征重要性信息，请先训练模型")
                return
                
            # 创建新窗口
            importance_window = tk.Toplevel(root)
            importance_window.title(f"{prediction_mode} - 特征重要性分析")
            importance_window.geometry("800x600")
            
            try:
                with open(model_cache_file, 'rb') as f:
                    model_data = pickle.load(f)
                    
                # 创建XGBoost特征重要性表格
                xgb_frame = ttk.Frame(importance_window, padding=10)
                xgb_frame.pack(fill=tk.BOTH, expand=True)
                
                # 创建特征重要性表格
                xgb_tree = ttk.Treeview(xgb_frame, columns=("rank", "feature", "importance"), show="headings")
                xgb_tree.heading("rank", text="排名")
                xgb_tree.heading("feature", text="特征名称")
                xgb_tree.heading("importance", text="重要性")
                xgb_tree.column("rank", width=50)
                xgb_tree.column("feature", width=300)
                xgb_tree.column("importance", width=100)
                
                # 添加滚动条
                xgb_scrollbar = ttk.Scrollbar(xgb_frame, orient="vertical", command=xgb_tree.yview)
                xgb_tree.configure(yscrollcommand=xgb_scrollbar.set)
                
                # 放置表格和滚动条
                xgb_tree.pack(side=tk.LEFT, fill=tk.BOTH, expand=True)
                xgb_scrollbar.pack(side=tk.RIGHT, fill=tk.Y)
                
                # 获取特征重要性和特征名称
                feature_importance = model_data['models']['xgb'].feature_importances_
                feature_names = model_data.get('feature_columns', [f'feature_{i}' for i in range(len(feature_importance))])
                
                # 确保特征名称和重要性长度匹配
                if len(feature_names) != len(feature_importance):
                    feature_names = [f'feature_{i}' for i in range(len(feature_importance))]
                
                # 排序并填充表格
                sorted_idx = np.argsort(feature_importance)[::-1]
                for i, idx in enumerate(sorted_idx):
                    xgb_tree.insert("", tk.END, values=(i+1, feature_names[idx], f"{feature_importance[idx]:.6f}"))
                    
                # 添加特征重要性条形图
                fig_frame = ttk.Frame(importance_window, padding=10)
                fig_frame.pack(fill=tk.BOTH, expand=True)
                
                # 仅显示前20个重要特征
                top_n = min(20, len(sorted_idx))
                top_features = [feature_names[idx] for idx in sorted_idx[:top_n]]
                top_importance = [feature_importance[idx] for idx in sorted_idx[:top_n]]
                
                fig, ax = plt.subplots(figsize=(7, 4), dpi=100)
                y_pos = np.arange(len(top_features))
                
                # 创建水平条形图
                ax.barh(y_pos, top_importance, align='center')
                ax.set_yticks(y_pos)
                # 显示简化的特征名称（避免过长）
                ax.set_yticklabels([f[:20] + '...' if len(f) > 20 else f for f in top_features])
                ax.invert_yaxis()  # 最重要的在顶部
                ax.set_xlabel('特征重要性')
                ax.set_title('XGBoost Top 20 特征重要性')
                
                canvas = FigureCanvasTkAgg(fig, master=fig_frame)
                canvas.draw()
                canvas.get_tk_widget().pack(fill=tk.BOTH, expand=True)
                
            except Exception as e:
                print(f"加载XGBoost特征重要性失败: {e}")
            
            # 添加关闭按钮
            close_button = ttk.Button(importance_window, text="关闭", command=importance_window.destroy)
            close_button.pack(pady=10)
            
        except Exception as e:
            status_label.config(text=f"显示特征重要性时出错: {str(e)}")
            import traceback
            traceback.print_exc()

    def on_predict():
        try:
            predict_start = time()
            status_label.config(text="预测中...")
            root.update()
            start_time_dt = pd.to_datetime(entry.get())
            force_retrain = retrain_var.get()
            future_dates, predictions, model, prediction_intervals = train_and_predict(df, start_time_dt, force_retrain)
            if future_dates is None or predictions is None:
                status_label.config(text="预测失败")
                return

            # 获取并显示模型准确度指标
            model_metrics = get_model_metrics()
            if model_metrics and model_metrics['rmse'] > 0:
                metrics_text = f"模型准确度 - RMSE: {model_metrics['rmse']:.4f}, MAE: {model_metrics['mae']:.4f}"
            else:
                metrics_text = "模型准确度: 未知"
            metrics_label.config(text=metrics_text)

            # 清除图形并重新绘制
            ax.clear()
            
            # 绘制历史数据（预测时间点之前的所有数据）
            history_end = min(start_time_dt, df['DateTime'].max())
            history_start = df['DateTime'].min()  # 使用所有可用的历史数据
            hist_data = df[(df['DateTime'] >= history_start) & (df['DateTime'] <= history_end)]
      
            # 确保数据不为空
            if len(hist_data) == 0:
                status_label.config(text="错误: 所选时间范围内没有历史数据")
                return
                
            # 检查source_name列是否存在，如果不存在则使用默认值
            if 'source_name' in hist_data.columns:
                source = hist_data["source_name"].iloc[0]
            else:
                # 根据当前预测模式判断上游名称
                source = "青龙港" if prediction_mode == "青龙港-陈行" else "太仓石化"
            
            # 绘制基本数据
            ax.plot(hist_data['DateTime'], hist_data['downstream_smooth'], 
                    label='陈行(下游)盐度', color='blue', linewidth=1.5)
            ax.plot(hist_data['DateTime'], hist_data['upstream_smooth'], 
                    label=f'{source}(上游)盐度', color='purple', linewidth=1.5, alpha=0.7)
            
            # 添加盐度250的标注线
            ax.axhline(y=250, color='red', linestyle='--', alpha=0.7, linewidth=1.5, label='盐度警戒线 (250)')
            
            # 绘制预测数据
            if len(future_dates) > 0 and len(predictions) > 0:
                ax.plot(future_dates, predictions, marker='o', linestyle='--', 
                        label='递归预测盐度', color='red', linewidth=2)
                
                # 添加预测的置信区间
                if prediction_intervals is not None:
                    ax.fill_between(future_dates, prediction_intervals[0], prediction_intervals[1], 
                                   color='red', alpha=0.2, label='95% 置信区间')
            
            # 绘制实际数据(如果有)
            actual_data = df[(df['DateTime'] >= start_time_dt) & (df['DateTime'] <= future_dates[-1])]
            actual_values = None
            
            if not actual_data.empty:
                actual_values = []
                # 获取与预测日期最接近的实际数据
                for pred_date in future_dates:
                    closest_idx = np.argmin(np.abs(actual_data['DateTime'] - pred_date))
                    actual_values.append(actual_data['downstream_smooth'].iloc[closest_idx])
                
                # 绘制实际盐度曲线
                ax.plot(future_dates, actual_values, marker='s', linestyle='-', 
                        label='实际盐度', color='orange', linewidth=2)
            
            # 设置图表标题和标签
            ax.set_xlabel('日期')
            ax.set_ylabel('盐度')
            ax.set_title(f"{prediction_mode}从 {start_time_dt.strftime('%Y-%m-%d %H:%M:%S')} 开始的递归单步盐度预测")
            
            # 设置图例并应用紧凑布局
            ax.legend(loc='best')
            fig.tight_layout()
            
            # 保存初始视图范围用于重置
            global current_view
            current_view['xlim'] = ax.get_xlim()
            current_view['ylim'] = ax.get_ylim()
            
            # 强制重绘
            plt.close(fig)
            fig.canvas.draw()
            fig.canvas.flush_events()
            plt.draw()
            
            # 更新预测结果文本
            predict_time = time() - predict_start
            status_label.config(text=f"递归预测完成 (耗时: {predict_time:.2f}秒)")
            
            # 显示预测结果
            result_text = "递归单步预测结果:\n\n"
            
            # 如果有实际值，计算差值和百分比误差
            if actual_values is not None:
                result_text += "日期         预测值      实际值       差值\n"
                result_text += "--------------------------------------\n"
                for i, (date, pred, actual) in enumerate(zip(future_dates, predictions, actual_values)):
                    if actual is not None:  # 只在有实际值时显示差值
                        diff = pred - actual
                        result_text += f"{date.strftime('%Y-%m-%d')}  {pred:6.2f}    {actual:6.2f}    {diff:6.2f}\n"
                    else:
                        result_text += f"{date.strftime('%Y-%m-%d')}  {pred:6.2f}    --         --\n"
            else:
                result_text += "日期         预测值\n"
                result_text += "-------------------\n"
                for i, (date, pred) in enumerate(zip(future_dates, predictions)):
                    result_text += f"{date.strftime('%Y-%m-%d')}  {pred:6.2f}\n"
                result_text += "\n无实际值进行对比"
            
            update_result_text(result_text)
        except Exception as e:
            status_label.config(text=f"错误: {str(e)}")
            import traceback
            traceback.print_exc()

    def display_history_data():
        """显示历史盐度数据"""
        try:
            # 清除图形并重新绘制
            ax.clear()
            
            # 获取所有历史数据
            start_date = df['DateTime'].min()
            end_date = df['DateTime'].max()
            hist_data = df.copy()  # 使用所有数据
            
            # 确保数据不为空
            if len(hist_data) == 0:
                status_label.config(text="错误: 没有可用的历史数据")
                return
            
            # 绘制基本数据
            ax.plot(hist_data['DateTime'], hist_data['downstream_smooth'], 
                    label='陈行(下游)盐度', color='blue', linewidth=1.5)
                
            # 检查source_name列是否存在，如果不存在则使用默认值
            if 'source_name' in hist_data.columns:
                source = hist_data["source_name"].iloc[0]
            else:
                # 根据当前预测模式判断上游名称
                source = "青龙港" if prediction_mode == "青龙港-陈行" else "太仓石化"
            
            ax.plot(hist_data['DateTime'], hist_data['upstream_smooth'], 
                    label=f'{source}(上游)盐度', color='purple', linewidth=1.5, alpha=0.7)
            
            # 添加盐度250的标注线
            ax.axhline(y=250, color='red', linestyle='--', alpha=0.7, linewidth=1.5, label='盐度警戒线 (250)')
            
            # 设置图表标题和标签
            ax.set_xlabel('日期')
            ax.set_ylabel('盐度')
            ax.set_title(f"{prediction_mode}全部历史盐度数据 ({start_date.strftime('%Y-%m-%d')} 至 {end_date.strftime('%Y-%m-%d')})")
            
            # 设置图例并应用紧凑布局
            ax.legend(loc='best')
            fig.tight_layout()
            
            # 保存初始视图范围用于重置
            global current_view
            current_view['xlim'] = ax.get_xlim()
            current_view['ylim'] = ax.get_ylim()
            
            # 强制重绘
            plt.close(fig)
            fig.canvas.draw()
            fig.canvas.flush_events()
            plt.draw()
            
            status_label.config(text=f"显示全部历史数据 ({len(hist_data)} 个数据点)")
            
            # 更新结果文本
            result_text = "历史盐度统计信息:\n\n"
            result_text += f"数据时间范围: {start_date.strftime('%Y-%m-%d')} 至 {end_date.strftime('%Y-%m-%d')}\n"
            result_text += f"数据点数量: {len(hist_data)}\n\n"
            result_text += f"{source}上游盐度:\n"
            result_text += f"  最小值: {hist_data['upstream_smooth'].min():.2f}\n"
            result_text += f"  最大值: {hist_data['upstream_smooth'].max():.2f}\n"
            result_text += f"  平均值: {hist_data['upstream_smooth'].mean():.2f}\n"
            result_text += f"  标准差: {hist_data['upstream_smooth'].std():.2f}\n\n"
            result_text += "陈行下游盐度:\n"
            result_text += f"  最小值: {hist_data['downstream_smooth'].min():.2f}\n"
            result_text += f"  最大值: {hist_data['downstream_smooth'].max():.2f}\n"
            result_text += f"  平均值: {hist_data['downstream_smooth'].mean():.2f}\n"
            result_text += f"  标准差: {hist_data['downstream_smooth'].std():.2f}\n"
            
            update_result_text(result_text)
        except Exception as e:
            status_label.config(text=f"显示历史数据时出错: {str(e)}")
            import traceback
            traceback.print_exc()

    def on_scroll(event):
        xlim = ax.get_xlim()
        ylim = ax.get_ylim()
        zoom_factor = 1.1
        x_data = event.xdata if event.xdata is not None else (xlim[0]+xlim[1])/2
        y_data = event.ydata if event.ydata is not None else (ylim[0]+ylim[1])/2
        x_rel = (x_data - xlim[0]) / (xlim[1] - xlim[0])
        y_rel = (y_data - ylim[0]) / (ylim[1] - ylim[0])
        if event.step > 0:
            new_width = (xlim[1]-xlim[0]) / zoom_factor
            new_height = (ylim[1]-ylim[0]) / zoom_factor
            x0 = x_data - x_rel * new_width
            y0 = y_data - y_rel * new_height
            ax.set_xlim([x0, x0+new_width])
            ax.set_ylim([y0, y0+new_height])
        else:
            new_width = (xlim[1]-xlim[0]) * zoom_factor
            new_height = (ylim[1]-ylim[0]) * zoom_factor
            x0 = x_data - x_rel * new_width
            y0 = y_data - y_rel * new_height
            ax.set_xlim([x0, x0+new_width])
            ax.set_ylim([y0, y0+new_height])
        canvas.draw_idle()

    def update_cursor(event):
        if event.inaxes == ax:
            canvas.get_tk_widget().config(cursor="fleur")
        else:
            canvas.get_tk_widget().config(cursor="")

    def reset_view():
        global current_view
        if current_view['xlim'] is not None:
            # 应用保存的视图范围
            ax.set_xlim(current_view['xlim'])
            ax.set_ylim(current_view['ylim'])
            
            # 应用紧凑布局并重绘
            fig.tight_layout()
            canvas.draw_idle()
            status_label.config(text="图表视图已重置")
        else:
            status_label.config(text="没有可用的初始视图范围")

    root = tk.Tk()
    root.title(f"{prediction_mode}盐度预测系统")
    try:
        configure_gui_fonts()
    except Exception as e:
        print("字体配置异常:", e)
        
    # 恢复输入框和控制按钮
    input_frame = ttk.Frame(root, padding="10")
    input_frame.pack(fill=tk.X)
    
    ttk.Label(input_frame, text="输入开始时间 (YYYY-MM-DD HH:MM:SS)").pack(side=tk.LEFT)
    entry = ttk.Entry(input_frame, width=25)
    entry.pack(side=tk.LEFT, padx=5)
    predict_button = ttk.Button(input_frame, text="预测", command=on_predict)
    predict_button.pack(side=tk.LEFT)
    status_label = ttk.Label(input_frame, text="提示: 第一次运行请勾选'强制重新训练模型'")
    status_label.pack(side=tk.LEFT, padx=10)
    
    control_frame = ttk.Frame(root, padding="5")
    control_frame.pack(fill=tk.X)
    retrain_var = tk.BooleanVar(value=False)
    ttk.Checkbutton(control_frame, text="强制重新训练模型", variable=retrain_var).pack(side=tk.LEFT)
    
    # 添加显示历史数据按钮
    history_button = ttk.Button(control_frame, text="显示历史数据", command=display_history_data)
    history_button.pack(side=tk.LEFT, padx=5)
    
    # 添加切换数据源的按钮
    switch_button = ttk.Button(control_frame, text="切换到太仓石化-陈行", command=switch_prediction_mode)
    switch_button.pack(side=tk.LEFT, padx=5)
    
    # # 添加查看特征重要性按钮
    # feature_button = ttk.Button(control_frame, text="查看特征重要性", command=show_feature_importance)
    # feature_button.pack(side=tk.LEFT, padx=5)
    
    # 更新图例说明，添加盐度警戒线信息
    legend_label = ttk.Label(control_frame, text="图例: 紫色=上游数据, 蓝色=下游数据, 红色=预测值, 橙色=实际值, 红色虚线=盐度警戒线(250)")
    legend_label.pack(side=tk.LEFT, padx=10)
    reset_button = ttk.Button(control_frame, text="重置视图", command=reset_view)
    reset_button.pack(side=tk.LEFT, padx=5)
    
    # 添加显示模型准确度的标签
    metrics_frame = ttk.Frame(root, padding="5")
    metrics_frame.pack(fill=tk.X)
    model_metrics = get_model_metrics()
    metrics_text = "模型准确度: 未知" if not model_metrics else f"模型准确度 - RMSE: {model_metrics['rmse']:.4f}, MAE: {model_metrics['mae']:.4f}"
    metrics_label = ttk.Label(metrics_frame, text=metrics_text)
    metrics_label.pack(side=tk.LEFT, padx=10)
    
    # 结果显示区域
    result_frame = ttk.Frame(root, padding="10")
    result_frame.pack(fill=tk.BOTH, expand=True)
    
    # 左侧放置图表
    plot_frame = ttk.Frame(result_frame, width=800, height=600)
    plot_frame.pack(side=tk.LEFT, fill=tk.BOTH, expand=True)
    plot_frame.pack_propagate(False)  # 不允许框架根据内容调整大小
    
    # 右侧放置文本结果
    text_frame = ttk.Frame(result_frame)
    text_frame.pack(side=tk.RIGHT, fill=tk.Y)
    
    # 使用等宽字体显示结果
    result_font = tkfont.Font(family="Courier New", size=10, weight="normal")
    
    # 添加文本框和滚动条
    result_text = tk.Text(text_frame, width=50, height=25, font=result_font, wrap=tk.NONE)
    result_text.pack(side=tk.LEFT, fill=tk.BOTH)
    result_scroll = ttk.Scrollbar(text_frame, orient="vertical", command=result_text.yview)
    result_scroll.pack(side=tk.RIGHT, fill=tk.Y)
    result_text.configure(yscrollcommand=result_scroll.set)
    result_text.configure(state=tk.DISABLED)  # 初始设为只读

    # 更新结果文本的函数
    def update_result_text(text):
        result_text.configure(state=tk.NORMAL)
        result_text.delete(1.0, tk.END)
        result_text.insert(tk.END, text)
        result_text.configure(state=tk.DISABLED)

    # 创建更高DPI的图形以获得更好的显示质量
    fig, ax = plt.subplots(figsize=(10, 6), dpi=100)
    fig.tight_layout(pad=3.0)  # 增加内边距，防止标签被截断
    
    # 创建画布并添加到固定大小的框架
    canvas = FigureCanvasTkAgg(fig, master=plot_frame)
    canvas.get_tk_widget().pack(side=tk.TOP, fill=tk.BOTH, expand=True)
    
    # 添加工具栏，包含缩放、保存等功能
    toolbar_frame = ttk.Frame(plot_frame)
    toolbar_frame.pack(side=tk.BOTTOM, fill=tk.X)
    toolbar = NavigationToolbar2Tk(canvas, toolbar_frame)
    toolbar.update()
    
    # 启用紧凑布局，并设置自动调整以使图表完全显示
    def on_resize(event):
        fig.tight_layout()
        canvas.draw_idle()
    
    # 添加图表交互功能
    canvas.mpl_connect('resize_event', on_resize)
    canvas.mpl_connect('scroll_event', on_scroll)
    canvas.mpl_connect('motion_notify_event', update_cursor)
    
    # 添加鼠标拖动功能
    def on_press(event):
        if event.inaxes != ax:
            return
        canvas.get_tk_widget().config(cursor="fleur")
        ax._pan_start = (event.x, event.y, event.xdata, event.ydata)
    
    def on_release(event):
        ax._pan_start = None
        canvas.get_tk_widget().config(cursor="")
        canvas.draw_idle()
    
    def on_motion(event):
        if not hasattr(ax, '_pan_start') or ax._pan_start is None:
            return
        if event.inaxes != ax:
            return
        
        start_x, start_y, x_data, y_data = ax._pan_start
        dx = event.x - start_x
        dy = event.y - start_y
        
        # 获取当前视图
        xlim = ax.get_xlim()
        ylim = ax.get_ylim()
        
        # 计算图表坐标系中的移动
        x_scale = (xlim[1] - xlim[0]) / canvas.get_tk_widget().winfo_width()
        y_scale = (ylim[1] - ylim[0]) / canvas.get_tk_widget().winfo_height()
        
        # 更新视图
        ax.set_xlim(xlim[0] - dx * x_scale, xlim[1] - dx * x_scale)
        ax.set_ylim(ylim[0] + dy * y_scale, ylim[1] + dy * y_scale)
        
        # 更新拖动起点
        ax._pan_start = (event.x, event.y, event.xdata, event.ydata)
        
        canvas.draw_idle()
    
    # 连接鼠标事件
    canvas.mpl_connect('button_press_event', on_press)
    canvas.mpl_connect('button_release_event', on_release)
    canvas.mpl_connect('motion_notify_event', on_motion)
    
    # 初始显示历史数据
    display_history_data()
    
    root.mainloop()

# 向量化构造训练样本（优化特征工程）
# -------------------------------
def create_features_vectorized(df, look_back=168, forecast_horizon=1):
    """
    向量化构造训练样本，使用过去7天的所有原始数据来预测未来1天的下游盐度均值
    """
    try:
        # 确保数据按时间排序
        df = df.sort_values('DateTime')
        
        # 获取所有数值列（排除DateTime列）
        numeric_columns = df.select_dtypes(include=[np.number]).columns.tolist()
        if 'DateTime' in numeric_columns:
            numeric_columns.remove('DateTime')
        
        # 初始化特征和标签列表
        features = []  # x输入
        targets = []   # y输出
        
        # 使用滑动窗口创建样本
        for i in range(len(df) - look_back - forecast_horizon + 1):
            # 获取7天的特征窗口
            window = df.iloc[i:i+look_back]
            
            # 提取特征 - 使用所有原始数据
            window_features = []
            for col in numeric_columns:
                # 获取列数据并处理NaN值
                col_values = window[col].fillna(method='ffill').fillna(method='bfill').values
                window_features.extend(col_values)
            
            # 添加时间特征
            current_date = window['DateTime'].iloc[-1]
            window_features.extend([
                current_date.month,
                current_date.day,
                current_date.weekday()
            ])
            
            # 新增：增强最近时段数据的权重（最后24小时的数据权重提高）
            # 提取最后24小时的上游和下游盐度数据
            last_24h = window.iloc[-24:]
            if len(last_24h) == 24:
                # # 上游盐度的最后24小时数据，每小时一个点
                # recent_upstream = last_24h['upstream_smooth'].values
                # 下游盐度的最后24小时数据，每小时一个点
                recent_downstream = last_24h['downstream_smooth'].values
                
                # 计算最后24小时的统计特征
                recent_stats = [
                    # np.mean(recent_upstream),          # 均值
                    # np.std(recent_upstream),           # 标准差
                    np.mean(recent_downstream),        # 均值
                    np.std(recent_downstream),         # 标准差
                    # recent_upstream[-1] - recent_upstream[0],  # 总变化
                    recent_downstream[-1] - recent_downstream[0],  # 总变化
                    # np.mean(np.diff(recent_upstream)),  # 平均变化率
                    np.mean(np.diff(recent_downstream))  # 平均变化率
                ]
                
                # 添加到特征中，并给这些特征3倍的权重（重复添加）
                window_features.extend(recent_stats)
                window_features.extend(recent_stats)  # 重复一次增加权重
                window_features.extend(recent_stats)  # 再次重复
            
            # 获取目标值（未来1天的下游盐度均值）
            next_day = df.iloc[i+look_back:i+look_back+24]  # 获取未来24小时的数据
            # 处理目标值中的NaN
            target_values = next_day['downstream_smooth'].fillna(method='ffill').fillna(method='bfill').values
            target = np.mean(target_values)
            
            # 检查特征和目标值是否有效
            if not np.any(np.isnan(window_features)) and not np.isnan(target) and not np.isinf(target):
                features.append(window_features)
                targets.append(target)
        
        if not features:
            print("警告: 未能生成任何有效特征")
            return np.array([]), np.array([])
            
        # 转换为numpy数组
        X = np.array(features)
        y = np.array(targets)
        
        print(f"成功生成特征矩阵，形状: {X.shape}")
        return X, y
        
    except Exception as e:
        print(f"特征创建异常: {e}")
        return np.array([]), np.array([])

def generate_prediction_features(df, current_date, look_back=168):
    """
    为预测生成特征，使用与create_features_vectorized相同的特征生成逻辑
    """
    try:
        # 确保数据按时间排序
        df = df.sort_values('DateTime')
        
        # 获取所有数值列（排除DateTime列）
        numeric_columns = df.select_dtypes(include=[np.number]).columns.tolist()
        if 'DateTime' in numeric_columns:
            numeric_columns.remove('DateTime')
        
        # 找到当前日期在数据中的位置
        current_idx = df[df['DateTime'] <= current_date].index[-1]
        
        # 获取过去168小时（7天）的数据窗口
        if current_idx < look_back:
            print(f"数据不足，需要{look_back}小时的数据，但只有{current_idx+1}小时")
            return None
            
        window = df.iloc[current_idx-look_back+1:current_idx+1]
        
        # 提取特征 - 使用所有原始数据
        features = []
        for col in numeric_columns:
            # 直接使用原始数据作为特征
            features.extend(window[col].values)
        
        # 添加时间特征
        features.extend([
            current_date.month,
            current_date.day,
            current_date.weekday()
        ])
        
        # 新增：增强最近时段数据的权重（最后24小时的数据权重提高）
        # 提取最后24小时的数据
        last_24h = window.iloc[-24:]
        if len(last_24h) == 24:
            # 上游盐度的最后24小时数据，每小时一个点
            recent_upstream = last_24h['upstream_smooth'].values
            # 下游盐度的最后24小时数据，每小时一个点
            recent_downstream = last_24h['downstream_smooth'].values
            
            # 计算最后24小时的统计特征
            recent_stats = [
                np.mean(recent_upstream),          # 均值
                np.std(recent_upstream),           # 标准差
                np.mean(recent_downstream),        # 均值
                np.std(recent_downstream),         # 标准差
                recent_upstream[-1] - recent_upstream[0],  # 总变化
                recent_downstream[-1] - recent_downstream[0],  # 总变化
                np.mean(np.diff(recent_upstream)),  # 平均变化率
                np.mean(np.diff(recent_downstream))  # 平均变化率
            ]
            
            # 添加到特征中，并给这些特征3倍的权重（重复添加）
            features.extend(recent_stats)
            features.extend(recent_stats)  # 重复一次增加权重
            features.extend(recent_stats)  # 再次重复
        
        return np.array(features)
        
    except Exception as e:
        print(f"预测特征生成异常: {e}")
        return None

# 主函数
def main():
    global df, current_df, qinglong_df, taicang_df
    # 加载两个数据集
    qinglong_df, taicang_df = load_both_datasets()
    current_df = qinglong_df  # 默认使用青龙港-陈行数据集

    if current_df is not None:
        df = current_df  # 设置当前使用的数据集
        run_gui()
    else:
        print("数据加载失败，无法运行预测。")

# 在程序入口处调用main函数
if __name__ == "__main__":
    main()