import pandas as pd
import numpy as np
import tkinter as tk
from tkinter import ttk
from datetime import timedelta
import matplotlib.pyplot as plt
from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg, NavigationToolbar2Tk
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
import pickle
import os
from time import time
import matplotlib

# Configure matplotlib to render Chinese text
# Preferred CJK fonts, tried in order depending on what the system provides
matplotlib.rcParams['font.sans-serif'] = ['SimHei', 'Microsoft YaHei', 'SimSun', 'Arial Unicode MS']
matplotlib.rcParams['axes.unicode_minus'] = False  # render minus signs correctly alongside CJK fonts
matplotlib.rcParams['font.family'] = 'sans-serif'

# Module-level cache
cached_model = None
last_training_time = None
feature_columns = None


# Data loading
def load_data(upstream_file, downstream_file, qinglong_lake_file=None):
    try:
        # The files are plain comma-separated CSV
        upstream_df = pd.read_csv(upstream_file)
        downstream_df = pd.read_csv(downstream_file)
        if qinglong_lake_file:
            qinglong_lake_df = pd.read_csv(qinglong_lake_file)
    except FileNotFoundError:
        print("File not found; please check the file paths")
        return None

    # The columns are expected to be 'DateTime,TagName,Value'
    upstream_df.columns = ['DateTime', 'TagName', 'Value']
    downstream_df.columns = ['DateTime', 'TagName', 'Value']
    if qinglong_lake_file:
        qinglong_lake_df.columns = ['DateTime', 'TagName', 'Value']

    # Parse the 'DateTime' column
    upstream_df['DateTime'] = pd.to_datetime(upstream_df['DateTime'])
    downstream_df['DateTime'] = pd.to_datetime(downstream_df['DateTime'])
    if qinglong_lake_file:
        qinglong_lake_df['DateTime'] = pd.to_datetime(qinglong_lake_df['DateTime'])

    # Coerce 'Value' to numeric before any outlier handling
    upstream_df['Value'] = pd.to_numeric(upstream_df['Value'], errors='coerce')
    downstream_df['Value'] = pd.to_numeric(downstream_df['Value'], errors='coerce')
    if qinglong_lake_file:
        qinglong_lake_df['Value'] = pd.to_numeric(qinglong_lake_df['Value'], errors='coerce')

    # Drop readings with salinity below 5
    upstream_df = upstream_df[upstream_df['Value'] >= 5]
    downstream_df = downstream_df[downstream_df['Value'] >= 5]
    if qinglong_lake_file:
        qinglong_lake_df = qinglong_lake_df[qinglong_lake_df['Value'] >= 5]

    # Replace zeros with NaN so they get interpolated later
    # (redundant after the >= 5 filter above; kept as a safeguard)
    upstream_df.loc[upstream_df['Value'] == 0, 'Value'] = np.nan
    downstream_df.loc[downstream_df['Value'] == 0, 'Value'] = np.nan
    if qinglong_lake_file:
        qinglong_lake_df.loc[qinglong_lake_df['Value'] == 0, 'Value'] = np.nan

    # Flag outliers with the 3-sigma rule and replace them with NaN
    for df in [upstream_df, downstream_df]:
        mean = df['Value'].mean()
        std = df['Value'].std()
        lower_bound = mean - 3 * std
        upper_bound = mean + 3 * std
        df.loc[(df['Value'] < lower_bound) | (df['Value'] > upper_bound), 'Value'] = np.nan
    if qinglong_lake_file:
        mean = qinglong_lake_df['Value'].mean()
        std = qinglong_lake_df['Value'].std()
        lower_bound = mean - 3 * std
        upper_bound = mean + 3 * std
        qinglong_lake_df.loc[(qinglong_lake_df['Value'] < lower_bound) |
                             (qinglong_lake_df['Value'] > upper_bound), 'Value'] = np.nan

    # Rename 'Value' to 'upstream' / 'downstream' / 'qinglong_lake'
    upstream_df = upstream_df.rename(columns={'Value': 'upstream'})[['DateTime', 'upstream']]
    downstream_df = downstream_df.rename(columns={'Value': 'downstream'})[['DateTime', 'downstream']]
    if qinglong_lake_file:
        qinglong_lake_df = qinglong_lake_df.rename(columns={'Value': 'qinglong_lake'})[['DateTime', 'qinglong_lake']]

    # Merge the series on timestamp
    merged_df = pd.merge(upstream_df, downstream_df, on='DateTime', how='inner')
    if qinglong_lake_file:
        merged_df = pd.merge(merged_df, qinglong_lake_df, on='DateTime', how='left')

    # Handle NaN and invalid values
    print(f"Rows before cleaning: {len(merged_df)}")

    # Index by DateTime so time-weighted interpolation is possible
    merged_df = merged_df.set_index('DateTime')

    # Fill NaNs with a cascade of interpolation methods
    # 1. Linear interpolation for short gaps
    merged_df['upstream'] = merged_df['upstream'].interpolate(method='linear', limit=4)
    merged_df['downstream'] = merged_df['downstream'].interpolate(method='linear', limit=4)
    if qinglong_lake_file:
        merged_df['qinglong_lake'] = merged_df['qinglong_lake'].interpolate(method='linear', limit=4)

    # 2. Time-weighted interpolation for longer gaps
    merged_df['upstream'] = merged_df['upstream'].interpolate(method='time', limit=24)
    merged_df['downstream'] = merged_df['downstream'].interpolate(method='time', limit=24)
    if qinglong_lake_file:
        merged_df['qinglong_lake'] = merged_df['qinglong_lake'].interpolate(method='time', limit=24)

    # 3. Forward/backward fill for anything still missing
    merged_df['upstream'] = merged_df['upstream'].ffill().bfill()
    merged_df['downstream'] = merged_df['downstream'].ffill().bfill()
    if qinglong_lake_file:
        merged_df['qinglong_lake'] = merged_df['qinglong_lake'].ffill().bfill()

    # 4. Smooth with a centered 24-point moving average
    merged_df['upstream_smooth'] = merged_df['upstream'].rolling(window=24, min_periods=1, center=True).mean()
    merged_df['downstream_smooth'] = merged_df['downstream'].rolling(window=24, min_periods=1, center=True).mean()
    if qinglong_lake_file:
        merged_df['qinglong_lake_smooth'] = merged_df['qinglong_lake'].rolling(window=24, min_periods=1, center=True).mean()

    # Extra smoothing for the low-salinity (< 50) stretches of the 青龙港 data
    low_salinity_mask = merged_df['upstream'] < 50
    if low_salinity_mask.any():
        # Use a wider window where salinity is low
        merged_df.loc[low_salinity_mask, 'upstream_smooth'] = merged_df.loc[low_salinity_mask, 'upstream'].rolling(
            window=48, min_periods=1, center=True).mean()

    # Drop remaining NaN and infinite values
    merged_df = merged_df.dropna()
    merged_df = merged_df[np.isfinite(merged_df['upstream'])]
    merged_df = merged_df[np.isfinite(merged_df['downstream'])]
    if qinglong_lake_file:
        merged_df = merged_df[np.isfinite(merged_df['qinglong_lake'])]

    # Restore DateTime as a regular column
    merged_df = merged_df.reset_index()

    # Final sanity report
    print(f"Rows after cleaning: {len(merged_df)}")
    print(f"Upstream salinity range: {merged_df['upstream'].min()} - {merged_df['upstream'].max()}")
    print(f"Downstream salinity range: {merged_df['downstream'].min()} - {merged_df['downstream'].max()}")
    if qinglong_lake_file:
        print(f"青龙湖 salinity range: {merged_df['qinglong_lake'].min()} - {merged_df['qinglong_lake'].max()}")

    # Make sure the data is time-ordered
    merged_df = merged_df.sort_values('DateTime')

    return merged_df
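# A minimal, illustrative sanity check for load_data(); it is not called by the
# application. It assumes the two CSV files used at the bottom of this script
# are present next to it.
def _demo_load_data():
    demo_df = load_data('青龙港1.csv', '一取水.csv')
    if demo_df is not None:
        # After cleaning, the merged frame should contain no NaN values
        assert not demo_df[['upstream', 'downstream']].isna().any().any()
        # And it should be sorted by timestamp
        assert demo_df['DateTime'].is_monotonic_increasing
        print(demo_df[['DateTime', 'upstream', 'downstream', 'upstream_smooth']].head())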
# Feature engineering - performance-optimised version
def create_features(df, look_back=96, forecast_horizon=5):
    print("Starting feature engineering...")
    start_time = time()

    # Number of sliding-window samples that can be built
    total_samples = len(df) - look_back - forecast_horizon
    if total_samples <= 0:
        print("Not enough data to create features")
        return np.array([]), np.array([])

    # Pre-compute as much as possible to keep the main loop cheap
    # 1. Pre-compute all delay (lag) features
    print("Pre-computing delay features...")
    # Extra short-term lags, with emphasis on the most recent hours
    delay_hours = [1, 2, 3, 4, 6, 12, 24, 36, 48, 60, 72, 84, 96, 108, 120]
    delay_features_dict = {}

    # Index by timestamp so lags are looked up by time rather than position
    df_indexed = df.set_index('DateTime')

    # Lagged values for every delay, for both upstream and downstream
    for delay in delay_hours:
        delay_name_upstream = f'upstream_delay_{delay}h'
        delay_values_upstream = pd.Series(index=df_indexed.index, dtype=float)
        delay_name_downstream = f'downstream_delay_{delay}h'
        delay_values_downstream = pd.Series(index=df_indexed.index, dtype=float)

        for i, dt in enumerate(df_indexed.index):
            target_dt = dt - timedelta(hours=delay)
            if target_dt in df_indexed.index:
                delay_values_upstream[dt] = df_indexed.loc[target_dt, 'upstream']
                delay_values_downstream[dt] = df_indexed.loc[target_dt, 'downstream']
            else:
                # Fall back to the nearest available timestamp
                closest_dt = df_indexed.index[df_indexed.index.get_indexer([target_dt], method='nearest')[0]]
                delay_values_upstream[dt] = df_indexed.loc[closest_dt, 'upstream']
                delay_values_downstream[dt] = df_indexed.loc[closest_dt, 'downstream']

        delay_features_dict[delay_name_upstream] = delay_values_upstream
        delay_features_dict[delay_name_downstream] = delay_values_downstream

    # Back to a positional index for the remaining steps
    df_indexed = df_indexed.reset_index()

    # 2. Pre-compute calendar features, normalised to [0, 1)
    date_features = df['DateTime'].iloc[look_back:look_back + total_samples].apply(
        lambda x: [x.hour / 24, x.dayofweek / 7, x.month / 12]
    ).tolist()

    # 3. Pre-compute rolling statistics over sliding windows
    print("Pre-computing statistical features...")
    upstream = df['upstream'].values
    downstream = df['downstream'].values

    mean_1d_values_up = []
    mean_3d_values_up = []
    std_1d_values_up = []
    max_1d_values_up = []
    min_1d_values_up = []
    mean_1d_values_down = []
    mean_3d_values_down = []
    std_1d_values_down = []
    max_1d_values_down = []
    min_1d_values_down = []

    # Plain NumPy slicing over the raw arrays is fast enough here
    for i in range(look_back, look_back + total_samples):
        # Upstream statistics over the last 24 and 72 points
        day_window_up = upstream[max(0, i - 24):i]
        mean_1d_values_up.append(np.mean(day_window_up))
        std_1d_values_up.append(np.std(day_window_up))
        max_1d_values_up.append(np.max(day_window_up))
        min_1d_values_up.append(np.min(day_window_up))
        three_day_window_up = upstream[max(0, i - 72):i]
        mean_3d_values_up.append(np.mean(three_day_window_up))

        # Downstream statistics
        day_window_down = downstream[max(0, i - 24):i]
        mean_1d_values_down.append(np.mean(day_window_down))
        std_1d_values_down.append(np.std(day_window_down))
        max_1d_values_down.append(np.max(day_window_down))
        min_1d_values_down.append(np.min(day_window_down))
        three_day_window_down = downstream[max(0, i - 72):i]
        mean_3d_values_down.append(np.mean(three_day_window_down))

    # Assemble the final feature matrix and labels
    print("Building the final feature matrix...")
    features = []
    labels = []

    # Batched loop, mainly for progress reporting
    batch_size = 1000
    for batch_start in range(0, total_samples, batch_size):
        batch_end = min(batch_start + batch_size, total_samples)
        print(f"Processing samples: {batch_start}-{batch_end}/{total_samples}")

        for i in range(batch_start, batch_end):
            idx = i + look_back

            # Raw time-series windows for upstream and downstream
            window_up = upstream[i:i + look_back].copy()
            window_down = downstream[i:i + look_back].copy()

            # Skip windows that contain NaN
            if np.isnan(window_up).any() or np.isnan(window_down).any():
                continue

            # Calendar features (date_features is indexed by sample number, not by batch offset)
            time_feats = date_features[i]

            # Upstream statistics
            stats_feats_up = [
                mean_1d_values_up[i], mean_3d_values_up[i], std_1d_values_up[i],
                max_1d_values_up[i], min_1d_values_up[i]
            ]
            # Downstream statistics
            stats_feats_down = [
                mean_1d_values_down[i], mean_3d_values_down[i], std_1d_values_down[i],
                max_1d_values_down[i], min_1d_values_down[i]
            ]

            # Delay features, interleaved as (up, down) per delay
            current_dt = df['DateTime'].iloc[idx]
            delay_feats = []
            for delay in delay_hours:
                delay_name_up = f'upstream_delay_{delay}h'
                delay_name_down = f'downstream_delay_{delay}h'
                delay_val_up = delay_features_dict[delay_name_up].get(current_dt, np.nan)
                delay_val_down = delay_features_dict[delay_name_down].get(current_dt, np.nan)
                delay_feats.extend([delay_val_up, delay_val_down])

            # Replace NaN delay values with the mean of the valid ones
            if any(np.isnan(delay_feats)):
                mean_val = np.nanmean(delay_feats)
                delay_feats = [mean_val if np.isnan(x) else x for x in delay_feats]

            # Rate-of-change features over the last 5 points
            recent_changes_up = []
            recent_changes_down = []
            for j in range(1, 6):
                if i - j >= 0:
                    change_up = (upstream[i] - upstream[i - j]) / j
                    change_down = (downstream[i] - downstream[i - j]) / j
                    recent_changes_up.append(change_up)
                    recent_changes_down.append(change_down)
                else:
                    recent_changes_up.append(0)
                    recent_changes_down.append(0)

            # Acceleration (second-difference) features over the last 4 points
            recent_accelerations_up = []
            recent_accelerations_down = []
            for j in range(2, 6):
                if i - j >= 0:
                    acc_up = ((upstream[i] - upstream[i - 1]) - (upstream[i - 1] - upstream[i - j])) / (j - 1)
                    acc_down = ((downstream[i] - downstream[i - 1]) - (downstream[i - 1] - downstream[i - j])) / (j - 1)
                    recent_accelerations_up.append(acc_up)
                    recent_accelerations_down.append(acc_down)
                else:
                    recent_accelerations_up.append(0)
                    recent_accelerations_down.append(0)

            # Short-term downstream trend: slope of a linear fit to the last 12 points
            downstream_trend = np.polyfit(range(12), downstream[idx - 12:idx], 1)[0]

            # Concatenate all feature groups
            feature = np.concatenate([
                window_up,                  # full upstream look-back window
                window_down[-24:],          # last 24 hours of downstream data
                time_feats,                 # calendar features
                stats_feats_up,             # upstream statistics
                stats_feats_down,           # downstream statistics
                delay_feats,                # delay features
                recent_changes_up,          # upstream rates of change
                recent_changes_down,        # downstream rates of change
                recent_accelerations_up,    # upstream accelerations
                recent_accelerations_down,  # downstream accelerations
                [downstream_trend]          # downstream trend
            ])

            # Label: the next forecast_horizon downstream values
            label = df['downstream'].iloc[idx:idx + forecast_horizon].values

            # Keep only samples whose labels are fully valid
            if not np.isnan(label).any():
                features.append(feature)
                labels.append(label)

    if not features:
        print("Warning: no valid feature/label pairs could be generated")
        return np.array([]), np.array([])

    end_time = time()
    print(f"Feature engineering done; valid samples: {len(features)}, elapsed: {end_time - start_time:.2f}s")

    # Record the feature column names for later use. Delay names are
    # interleaved (up, down) per delay to match the construction above.
    global feature_columns
    feature_columns = (
        [f'upstream_t-{i}' for i in range(look_back, 0, -1)] +       # upstream window
        [f'downstream_t-{i}' for i in range(24, 0, -1)] +            # last 24 hours downstream
        ['hour', 'day_of_week', 'month'] +                           # calendar features
        ['mean_1d_up', 'mean_3d_up', 'std_1d_up', 'max_1d_up', 'min_1d_up'] +
        ['mean_1d_down', 'mean_3d_down', 'std_1d_down', 'max_1d_down', 'min_1d_down'] +
        [name for h in delay_hours for name in (f'delay_{h}h_up', f'delay_{h}h_down')] +
        [f'recent_change_{i}h_up' for i in range(1, 6)] +
        [f'recent_change_{i}h_down' for i in range(1, 6)] +
        [f'recent_acceleration_{i}h_up' for i in range(2, 6)] +
        [f'recent_acceleration_{i}h_down' for i in range(2, 6)] +
        ['downstream_trend']
    )

    return np.array(features), np.array(labels)
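# Illustrative shape check for create_features(); not called by the app. With
# look_back=96 the groups above add up to 96 + 24 + 3 + 5 + 5 + 2*15 + 5 + 5
# + 4 + 4 + 1 = 182 columns, which should equal len(feature_columns).
# `some_df` stands in for the frame returned by load_data().
def _demo_feature_shapes(some_df):
    X, y = create_features(some_df, look_back=96, forecast_horizon=5)
    if X.size:
        assert X.shape[1] == len(feature_columns), (X.shape, len(feature_columns))
        assert y.shape[1] == 5  # one label per forecast step
        print(f"X: {X.shape}, y: {y.shape}")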
# Training and prediction
def train_and_predict(df, start_time, force_retrain=False):
    global cached_model, last_training_time

    # Check for a cached model and the time of the last training run
    model_cache_file = 'salinity_model.pkl'
    model_needs_training = True

    # If the feature schema changed, force a retrain by removing the old model file
    if os.path.exists(model_cache_file) and force_retrain:
        try:
            os.remove(model_cache_file)
            print("Feature schema changed; removed the old cached model")
        except OSError:
            pass

    # Train only on data strictly before the prediction start time
    train_df = df[df['DateTime'] < start_time].copy()

    # Reuse the in-memory model if retraining is not forced
    if not force_retrain and cached_model is not None and last_training_time is not None:
        # No new data since the last training run, so the cached model is still valid
        if last_training_time >= train_df['DateTime'].max():
            model_needs_training = False
            print(f"Using cached model (last trained: {last_training_time})")
    # Otherwise try loading a model from disk
    elif not force_retrain and os.path.exists(model_cache_file):
        try:
            with open(model_cache_file, 'rb') as f:
                model_data = pickle.load(f)
                cached_model = model_data['model']
                last_training_time = model_data['training_time']
                # No new data since the last training run: keep the loaded model
                if last_training_time >= train_df['DateTime'].max():
                    model_needs_training = False
                    print(f"Loaded model from file (last trained: {last_training_time})")
        except Exception as e:
            print(f"Failed to load model: {e}")

    if model_needs_training:
        print("A new model needs to be trained...")

        if len(train_df) < 100:  # make sure there is enough training data
            print("Not enough training data")
            return None, None, None

        # Report training-data quality
        print(f"Training data range: {train_df['DateTime'].min()} to {train_df['DateTime'].max()}")
        print(f"NaN counts in training data:\nupstream: {train_df['upstream'].isna().sum()}\ndownstream: {train_df['downstream'].isna().sum()}")

        # Start the training timer
        start_time_training = time()

        # Feature engineering
        X, y = create_features(train_df)

        # Make sure there are enough samples
        if len(X) == 0 or len(y) == 0:
            print("Not enough valid samples for training")
            return None, None, None

        print(f"Samples available for training: {len(X)}")

        # Train/validation split
        X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

        # XGBoost model with tuned hyperparameters
        model = XGBRegressor(
            n_estimators=300,        # more trees
            learning_rate=0.03,      # lower learning rate to reduce overfitting
            max_depth=5,             # shallower trees to limit complexity
            min_child_weight=2,      # raised to curb overfitting
            subsample=0.85,          # row subsampling ratio per tree
            colsample_bytree=0.85,   # feature subsampling ratio per tree
            gamma=0.1,               # raised to reduce overfitting
            reg_alpha=0.2,           # L1 regularisation
            reg_lambda=1.5,          # L2 regularisation
            random_state=42
        )

        # Train the model
        try:
            print("Training the model...")
            # Note: passing eval_metric/early_stopping_rounds to fit() works on
            # older xgboost releases; on xgboost >= 2.0 they belong in the
            # XGBRegressor constructor instead.
            model.fit(
                X_train, y_train,
                eval_set=[(X_val, y_val)],
                eval_metric='rmse',
                early_stopping_rounds=20,
                verbose=False
            )

            # Validation performance
            val_pred = model.predict(X_val)
            rmse = np.sqrt(np.mean((val_pred - y_val) ** 2))
            print(f"Validation RMSE: {rmse:.4f}")

            # Record the training time and cache the model
            last_training_time = start_time
            cached_model = model

            # Persist the model to disk
            with open(model_cache_file, 'wb') as f:
                pickle.dump({
                    'model': model,
                    'training_time': last_training_time,
                    'feature_columns': feature_columns,
                    'rmse': rmse
                }, f)

            print(f"Model training finished, elapsed: {time() - start_time_training:.2f}s")

            # Feature-importance report
            importance = model.feature_importances_
            feature_importance = sorted(zip(feature_columns, importance), key=lambda x: x[1], reverse=True)
            print("Top 10 features by importance:")
            for feature, score in feature_importance[:10]:
                print(f"{feature}: {score:.4f}")
        except Exception as e:
            print(f"Model training failed: {e}")
            return None, None, None
    else:
        # Use the cached model
        model = cached_model

    # Build the feature vector for the latest data and predict
    try:
        print("Preparing prediction data...")
        # Fetch extra history so the longest delay (120 h) is covered
        latest_data = df[df['DateTime'] < start_time].tail(150)
        if len(latest_data) < 96:  # need at least a full look-back window
            print("Not enough history for prediction")
            return None, None, None

        # Raw windows: the last 4 days upstream and the last 24 hours downstream
        latest_window_up = latest_data['upstream'].tail(96).values
        latest_window_down = latest_data['downstream'].tail(24).values

        # Handle NaN in the windows
        if np.isnan(latest_window_up).any() or np.isnan(latest_window_down).any():
            print("Prediction window contains NaN values")
            latest_window_up = pd.Series(latest_window_up).ffill().values
            latest_window_down = pd.Series(latest_window_down).ffill().values

        # Calendar features
        hour = start_time.hour / 24
        day_of_week = start_time.dayofweek / 7
        month = start_time.month / 12

        # Upstream statistics
        recent_values_up = latest_data['upstream'].values
        mean_1d_up = np.mean(recent_values_up[-24:])
        mean_3d_up = np.mean(recent_values_up[-72:])
        std_1d_up = np.std(recent_values_up[-24:])
        max_1d_up = np.max(recent_values_up[-24:])
        min_1d_up = np.min(recent_values_up[-24:])

        # Downstream statistics
        recent_values_down = latest_data['downstream'].values
        mean_1d_down = np.mean(recent_values_down[-24:])
        mean_3d_down = np.mean(recent_values_down[-72:])
        std_1d_down = np.std(recent_values_down[-24:])
        max_1d_down = np.max(recent_values_down[-24:])
        min_1d_down = np.min(recent_values_down[-24:])

        # Delay features, looked up at the nearest available timestamp and
        # interleaved as (up, down) per delay to match the training layout
        delay_hours = [1, 2, 3, 4, 6, 12, 24, 36, 48, 60, 72, 84, 96, 108, 120]
        delay_features = []
        for hours_ago in delay_hours:
            target_date = start_time - timedelta(hours=hours_ago)
            idx = np.abs(latest_data['DateTime'].values - target_date.to_numpy().astype('datetime64[ns]')).argmin()
            delay_features.extend([latest_data['upstream'].iloc[idx],
                                   latest_data['downstream'].iloc[idx]])

        # Rates of change, upstream
        recent_changes_up = []
        for i in range(1, 6):
            if len(recent_values_up) > i:
                change = (recent_values_up[-1] - recent_values_up[-1 - i]) / i
                recent_changes_up.append(change)
            else:
                recent_changes_up.append(0)

        # Rates of change, downstream
        recent_changes_down = []
        for i in range(1, 6):
            if len(recent_values_down) > i:
                change = (recent_values_down[-1] - recent_values_down[-1 - i]) / i
                recent_changes_down.append(change)
            else:
                recent_changes_down.append(0)

        # Accelerations, upstream
        recent_accelerations_up = []
        for i in range(2, 6):
            if len(recent_values_up) > i:
                acc = ((recent_values_up[-1] - recent_values_up[-2]) - (recent_values_up[-2] - recent_values_up[-i])) / (i - 1)
                recent_accelerations_up.append(acc)
            else:
                recent_accelerations_up.append(0)

        # Accelerations, downstream
        recent_accelerations_down = []
        for i in range(2, 6):
            if len(recent_values_down) > i:
                acc = ((recent_values_down[-1] - recent_values_down[-2]) - (recent_values_down[-2] - recent_values_down[-i])) / (i - 1)
                recent_accelerations_down.append(acc)
            else:
                recent_accelerations_down.append(0)

        # Downstream trend
        downstream_trend = np.polyfit(range(12), recent_values_down[-12:], 1)[0]

        # Concatenate the feature groups in the same order as training
        latest_features = np.concatenate([
            latest_window_up,                                                     # upstream window
            latest_window_down,                                                   # last 24 hours downstream
            [hour, day_of_week, month],                                           # calendar features
            [mean_1d_up, mean_3d_up, std_1d_up, max_1d_up, min_1d_up],            # upstream statistics
            [mean_1d_down, mean_3d_down, std_1d_down, max_1d_down, min_1d_down],  # downstream statistics
            delay_features,                                                       # interleaved delay features
            recent_changes_up,                                                    # upstream rates of change
            recent_changes_down,                                                  # downstream rates of change
            recent_accelerations_up,                                              # upstream accelerations
            recent_accelerations_down,                                            # downstream accelerations
            [downstream_trend]                                                    # downstream trend
        ]).reshape(1, -1)

        # Print dimensions to help diagnose feature mismatches
        print(f"Prediction feature dimension: {latest_features.shape}, model expects: {len(feature_columns) if feature_columns else 'unknown'}")

        # Validate the feature vector
        if np.isnan(latest_features).any() or np.isinf(latest_features).any():
            print("Prediction features contain invalid values")
            latest_features = np.nan_to_num(latest_features, nan=0.0, posinf=1e6, neginf=-1e6)

        # Predict
        print("Running prediction...")
        predictions = model.predict(latest_features)

        # Build the future dates, one per forecast step
        future_dates = [start_time + timedelta(days=i) for i in range(5)]

        print("Prediction finished successfully")
        return future_dates, predictions.flatten(), model
    except Exception as e:
        print(f"Prediction failed: {e}")
        return None, None, None
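# Illustrative headless use of train_and_predict(); not wired into the GUI.
# `frame` stands for the DataFrame returned by load_data(), and the start time
# below is a hypothetical example that must lie inside the data's range.
def _demo_predict(frame):
    t0 = pd.to_datetime('2024-01-15 00:00:00')  # hypothetical start time
    dates, preds, fitted_model = train_and_predict(frame, t0, force_retrain=True)
    if dates is not None:
        for d, p in zip(dates, preds):
            print(f"{d:%Y-%m-%d %H:%M}: predicted downstream salinity {p:.2f}")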
# GUI
def run_gui():
    # Configure fonts so tkinter renders Chinese labels correctly
    def configure_gui_fonts():
        # Try each font known to support Chinese until one works
        font_names = ['微软雅黑', 'Microsoft YaHei', 'SimSun', 'SimHei']
        for font_name in font_names:
            try:
                default_font = tkfont.nametofont("TkDefaultFont")
                default_font.configure(family=font_name)
                text_font = tkfont.nametofont("TkTextFont")
                text_font.configure(family=font_name)
                fixed_font = tkfont.nametofont("TkFixedFont")
                fixed_font.configure(family=font_name)
                return True
            except Exception:
                continue
        return False

    def on_predict():
        try:
            predict_start_time = time()
            status_label.config(text="Predicting...")
            root.update()

            start_time = pd.to_datetime(entry.get())

            # Check whether a cached model matches the current feature schema
            cache_exists = os.path.exists('salinity_model.pkl')
            if cache_exists and not retrain_var.get():
                try:
                    with open('salinity_model.pkl', 'rb') as f:
                        model_data = pickle.load(f)
                        # Expected width of the current feature vector (see
                        # create_features): 96 upstream + 24 downstream + 3
                        # calendar + 5 + 5 stats + 2*15 delays + 5 + 5 changes
                        # + 4 + 4 accelerations + 1 trend
                        model_features = model_data.get('feature_columns', [])
                        expected_feature_count = 96 + 24 + 3 + 5 + 5 + 2 * 15 + 5 + 5 + 4 + 4 + 1
                        if len(model_features) != expected_feature_count:
                            status_label.config(text="Feature schema changed; please tick 'Force model retraining'")
                            return
                except Exception:
                    pass

            force_retrain = retrain_var.get()
            future_dates, predictions, model = train_and_predict(df, start_time, force_retrain)

            if future_dates is None or predictions is None:
                status_label.config(text="Prediction failed")
                return

            # Clear the previous plot
            ax.clear()

            # Plot the 120 days of history leading up to the start time
            history_end = min(start_time, df['DateTime'].max())
            history_start = history_end - timedelta(days=120)
            history_data = df[(df['DateTime'] >= history_start) & (df['DateTime'] <= history_end)]

            # Downstream (一取水) history
            ax.plot(history_data['DateTime'], history_data['downstream'],
                    label='一取水 (downstream) salinity', color='blue', linewidth=1.5)

            # Upstream (青龙港) history, smoothed
            ax.plot(history_data['DateTime'], history_data['upstream_smooth'],
                    label='青龙港 (upstream) salinity', color='purple', linewidth=1.5, alpha=0.7)

            # 青龙湖 history, smoothed, if present
            if 'qinglong_lake_smooth' in history_data.columns:
                ax.plot(history_data['DateTime'], history_data['qinglong_lake_smooth'],
                        label='青龙湖 salinity', color='green', linewidth=1.5, alpha=0.7)

            # Actual values over the prediction window, if available
            actual_data = df[(df['DateTime'] >= start_time) & (df['DateTime'] <= future_dates[-1])]

            # Predicted values
            ax.plot(future_dates, predictions, marker='o', linestyle='--',
                    label='Predicted salinity', color='red', linewidth=2)

            # Actual values, if present
            if not actual_data.empty:
                ax.plot(actual_data['DateTime'], actual_data['downstream'], marker='s', linestyle='-',
                        label='Actual salinity', color='orange', linewidth=2)

                # Prediction error against the actual readings closest to each forecast date
                actual_values = []
                for pred_date in future_dates:
                    closest_idx = (actual_data['DateTime'] - pred_date).abs().idxmin()
                    actual_values.append(actual_data.loc[closest_idx, 'downstream'])

                if len(actual_values) == len(predictions):
                    mse = np.mean((np.array(actual_values) - predictions) ** 2)
                    rmse = np.sqrt(mse)
                    mae = np.mean(np.abs(np.array(actual_values) - predictions))
                    # Show the error metrics on the plot
                    error_text = f"RMSE: {rmse:.2f}, MAE: {mae:.2f}"
                    ax.text(0.02, 0.05, error_text, transform=ax.transAxes,
                            bbox=dict(facecolor='white', alpha=0.8))

            # Confidence band: half the historical standard deviation as a rough
            # estimate of prediction uncertainty
            std_dev = history_data['downstream'].std() * 0.5
            ax.fill_between(future_dates, predictions - std_dev, predictions + std_dev,
                            color='red', alpha=0.2)

            # Title, labels and legend
            ax.set_xlabel('Date')
            ax.set_ylabel('Salinity')
            ax.set_title(f"Salinity forecast starting {start_time.strftime('%Y-%m-%d %H:%M:%S')}")
            ax.legend(loc='upper left')

            # Adjust the layout so labels are not clipped
            fig.tight_layout()
            canvas.draw()

            # Total prediction time
            predict_time = time() - predict_start_time
            status_label.config(text=f"Prediction finished (elapsed: {predict_time:.2f}s)")

            # Show the numeric results
            result_text = "Prediction results:\n"
            for i, (date, pred) in enumerate(zip(future_dates, predictions)):
                result_text += f"Day {i + 1} ({date.strftime('%Y-%m-%d')}): {pred:.2f}\n"
                # Append the actual value and error when available
                if not actual_data.empty and i < len(actual_values):
                    result_text += f"  actual: {actual_values[i]:.2f}\n"
                    result_text += f"  error: {abs(actual_values[i] - pred):.2f}\n"
            result_label.config(text=result_text)
        except Exception as e:
            status_label.config(text=f"Error: {str(e)}")

    def on_scroll(event):
        # Current axis limits
        xlim = ax.get_xlim()
        ylim = ax.get_ylim()

        # Zoom increment per wheel step
        zoom_factor = 1.1

        # Mouse position in data coordinates; fall back to the axis centre
        x_data = event.xdata
        y_data = event.ydata
        if x_data is None:
            x_data = (xlim[0] + xlim[1]) / 2
        if y_data is None:
            y_data = (ylim[0] + ylim[1]) / 2

        # Relative position of the mouse within the axes
        x_rel = (x_data - xlim[0]) / (xlim[1] - xlim[0])
        y_rel = (y_data - ylim[0]) / (ylim[1] - ylim[0])

        # event.step is positive when scrolling up (zoom in) and negative when
        # scrolling down (zoom out); it is more reliable than event.button here
        if event.step > 0:
            # Zoom in
            new_width = (xlim[1] - xlim[0]) / zoom_factor
            new_height = (ylim[1] - ylim[0]) / zoom_factor
        else:
            # Zoom out
            new_width = (xlim[1] - xlim[0]) * zoom_factor
            new_height = (ylim[1] - ylim[0]) * zoom_factor

        # New limits, keeping the point under the mouse fixed
        x0 = x_data - x_rel * new_width
        y0 = y_data - y_rel * new_height
        ax.set_xlim([x0, x0 + new_width])
        ax.set_ylim([y0, y0 + new_height])

        canvas.draw_idle()

    # Mouse-drag panning
    def on_mouse_press(event):
        if event.button == 1:  # left button
            canvas.mpl_disconnect(hover_cid)
            canvas._pan_start = (event.x, event.y)
            canvas._xlim = ax.get_xlim()
            canvas._ylim = ax.get_ylim()
            move_cid[0] = canvas.mpl_connect('motion_notify_event', on_mouse_move)

    def on_mouse_release(event):
        nonlocal hover_cid
        if event.button == 1:  # left button
            if move_cid[0] is not None:
                canvas.mpl_disconnect(move_cid[0])
                move_cid[0] = None
            hover_cid = canvas.mpl_connect('motion_notify_event', update_cursor)

    def on_mouse_move(event):
        if event.button == 1 and hasattr(canvas, '_pan_start'):
            dx = event.x - canvas._pan_start[0]
            dy = event.y - canvas._pan_start[1]

            # Convert the pixel delta into a data-coordinate delta
            x_span = canvas._xlim[1] - canvas._xlim[0]
            y_span = canvas._ylim[1] - canvas._ylim[0]
            width, height = canvas.get_width_height()
            x_scale = x_span / width
            y_scale = y_span / height

            # New limits (the y axis is inverted between screen and data coordinates)
            xlim = [canvas._xlim[0] - dx * x_scale, canvas._xlim[1] - dx * x_scale]
            ylim = [canvas._ylim[0] + dy * y_scale, canvas._ylim[1] + dy * y_scale]

            ax.set_xlim(xlim)
            ax.set_ylim(ylim)
            canvas.draw_idle()

    # Change the cursor over the axes to hint that the plot can be dragged
    def update_cursor(event):
        if event.inaxes == ax:
            canvas.get_tk_widget().config(cursor="fleur")  # move cursor
        else:
            canvas.get_tk_widget().config(cursor="")

    # Reset the view
    def reset_view():
        display_history()
        status_label.config(text="Chart view reset")

    root = tk.Tk()
    root.title("青龙港-陈行 Salinity Prediction System")

    # Try to configure fonts that can display Chinese labels
    try:
        import tkinter.font as tkfont
        configure_gui_fonts()
    except Exception:
        print("Could not configure GUI fonts; Chinese text may not render correctly")

    # Frames
    input_frame = ttk.Frame(root, padding="10")
    input_frame.pack(fill=tk.X)
    control_frame = ttk.Frame(root, padding="5")
    control_frame.pack(fill=tk.X)
    result_frame = ttk.Frame(root, padding="10")
    result_frame.pack(fill=tk.BOTH, expand=True)

    # Input box and predict button
    ttk.Label(input_frame, text="Start time (YYYY-MM-DD HH:MM:SS)").pack(side=tk.LEFT)
    entry = ttk.Entry(input_frame, width=25)
    entry.pack(side=tk.LEFT, padx=5)
    predict_button = ttk.Button(input_frame, text="Predict", command=on_predict)
    predict_button.pack(side=tk.LEFT)

    # Status label
    status_label = ttk.Label(input_frame, text="Hint: tick 'Force model retraining' on the first run")
    status_label.pack(side=tk.LEFT, padx=10)

    # Control options
    retrain_var = tk.BooleanVar(value=False)
    ttk.Checkbutton(control_frame, text="Force model retraining", variable=retrain_var).pack(side=tk.LEFT)

    # Legend hint (colours match the plot: actual values are drawn in orange)
    legend_label = ttk.Label(control_frame,
                             text="Legend: purple = 青龙港 upstream, blue = 一取水 downstream, green = 青龙湖, red = predicted, orange = actual")
    legend_label.pack(side=tk.LEFT, padx=10)

    # Result label
    result_label = ttk.Label(result_frame, text="", justify=tk.LEFT)
    result_label.pack(side=tk.RIGHT, fill=tk.Y)

    # Plot area; a higher dpi for a crisper figure
    fig, ax = plt.subplots(figsize=(10, 5), dpi=100)

    # Canvas plus navigation toolbar
    canvas = FigureCanvasTkAgg(fig, master=result_frame)
    canvas.get_tk_widget().pack(side=tk.LEFT, fill=tk.BOTH, expand=True)

    toolbar_frame = ttk.Frame(result_frame)
    toolbar_frame.pack(side=tk.BOTTOM, fill=tk.X)
    toolbar = NavigationToolbar2Tk(canvas, toolbar_frame)
    toolbar.update()

    # Custom reset button
    reset_button = ttk.Button(control_frame, text="Reset view", command=reset_view)
    reset_button.pack(side=tk.LEFT, padx=5)

    # Connect mouse events
    canvas.mpl_connect('button_press_event', on_mouse_press)
    canvas.mpl_connect('button_release_event', on_mouse_release)
    canvas.mpl_connect('scroll_event', on_scroll)

    # Connection ids for the pan/hover handlers (shared with the handlers above)
    move_cid = [None]
    hover_cid = canvas.mpl_connect('motion_notify_event', update_cursor)

    # Plot the historical data on startup
    def display_history():
        # Clear the previous plot
        ax.clear()

        # Show at most the last 60 days of history
        end_date = df['DateTime'].max()
        start_date = max(df['DateTime'].min(), end_date - timedelta(days=60))
        display_data = df[(df['DateTime'] >= start_date) & (df['DateTime'] <= end_date)]

        # Downstream (一取水) history
        ax.plot(display_data['DateTime'], display_data['downstream'],
                label='一取水 (downstream) salinity', color='blue', linewidth=1.5)

        # Upstream (青龙港) history, smoothed
        ax.plot(display_data['DateTime'], display_data['upstream_smooth'],
                label='青龙港 (upstream) salinity', color='purple', linewidth=1.5, alpha=0.7)

        # 青龙湖 history, smoothed, if present
        if 'qinglong_lake_smooth' in display_data.columns:
            ax.plot(display_data['DateTime'], display_data['qinglong_lake_smooth'],
                    label='青龙湖 salinity', color='green', linewidth=1.5, alpha=0.7)

        # Title, labels and legend
        ax.set_xlabel('Date')
        ax.set_ylabel('Salinity')
        ax.set_title('Historical salinity comparison')
        ax.legend()

        # Adjust the layout so labels are not clipped
        fig.tight_layout()
        canvas.draw()

    display_history()

    # Start the GUI event loop
    root.mainloop()


# Entry point
df = load_data('青龙港1.csv', '一取水.csv')

# Run the GUI only if the data loaded successfully
if df is not None:
    run_gui()
else:
    print("Data loading failed; cannot run prediction.")
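# Optional headless smoke test; an illustrative sketch that is never invoked
# automatically. It bypasses the GUI and predicts from five days before the end
# of the data so the forecast can be compared with actual readings (run it
# manually, e.g. from a REPL session started with `python -i`).
def smoke_test():
    if df is None:
        return
    demo_start = df['DateTime'].max() - timedelta(days=5)
    demo_dates, demo_preds, _ = train_and_predict(df, demo_start, force_retrain=False)
    if demo_dates is not None:
        for d, p in zip(demo_dates, demo_preds):
            print(f"{d:%Y-%m-%d}: {p:.2f}")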