import os import pickle import pandas as pd import numpy as np import tkinter as tk import tkinter.font as tkfont from tkinter import ttk from datetime import timedelta from time import time import matplotlib.pyplot as plt from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg, NavigationToolbar2Tk from xgboost import XGBRegressor from lunardate import LunarDate from sklearn.model_selection import train_test_split from sklearn.metrics import mean_squared_error, mean_absolute_error import matplotlib # 配置 matplotlib 中文显示 matplotlib.rcParams['font.sans-serif'] = ['SimHei', 'Microsoft YaHei', 'SimSun', 'Arial Unicode MS'] matplotlib.rcParams['axes.unicode_minus'] = False matplotlib.rcParams['font.family'] = 'sans-serif' # 全局缓存变量及特征名称(此处 feature_columns 仅为占位) cached_model = None last_training_time = None feature_columns = None # ------------------------------- # 数据加载与预处理函数 # ------------------------------- def load_data(upstream_file, downstream_file, qinglong_lake_file=None): try: upstream_df = pd.read_csv(upstream_file) downstream_df = pd.read_csv(downstream_file) if qinglong_lake_file: qinglong_lake_df = pd.read_csv(qinglong_lake_file) except FileNotFoundError: print("文件未找到,请检查路径") return None # 假设原始数据列依次为 ['DateTime', 'TagName', 'Value'] upstream_df.columns = ['DateTime', 'TagName', 'Value'] downstream_df.columns = ['DateTime', 'TagName', 'Value'] if qinglong_lake_file: qinglong_lake_df.columns = ['DateTime', 'TagName', 'Value'] # 转换时间格式及数值处理 upstream_df['DateTime'] = pd.to_datetime(upstream_df['DateTime']) downstream_df['DateTime'] = pd.to_datetime(downstream_df['DateTime']) if qinglong_lake_file: qinglong_lake_df['DateTime'] = pd.to_datetime(qinglong_lake_df['DateTime']) upstream_df['Value'] = pd.to_numeric(upstream_df['Value'], errors='coerce') downstream_df['Value'] = pd.to_numeric(downstream_df['Value'], errors='coerce') if qinglong_lake_file: qinglong_lake_df['Value'] = pd.to_numeric(qinglong_lake_df['Value'], errors='coerce') # 过滤盐度小于5的数据 upstream_df = upstream_df[upstream_df['Value'] >= 5] downstream_df = downstream_df[downstream_df['Value'] >= 5] if qinglong_lake_file: qinglong_lake_df = qinglong_lake_df[qinglong_lake_df['Value'] >= 5] # 将0替换为NaN,并利用3倍标准差法处理异常值 for df in [upstream_df, downstream_df]: df.loc[df['Value'] == 0, 'Value'] = np.nan mean_val, std_val = df['Value'].mean(), df['Value'].std() lower_bound, upper_bound = mean_val - 3 * std_val, mean_val + 3 * std_val df.loc[(df['Value'] < lower_bound) | (df['Value'] > upper_bound), 'Value'] = np.nan if qinglong_lake_file: qinglong_lake_df.loc[qinglong_lake_df['Value'] == 0, 'Value'] = np.nan mean_val, std_val = qinglong_lake_df['Value'].mean(), qinglong_lake_df['Value'].std() lower_bound, upper_bound = mean_val - 3 * std_val, mean_val + 3 * std_val qinglong_lake_df.loc[(qinglong_lake_df['Value'] < lower_bound) | (qinglong_lake_df['Value'] > upper_bound), 'Value'] = np.nan # 重命名 Value 列并保留需要的列 upstream_df = upstream_df.rename(columns={'Value': 'upstream'})[['DateTime', 'upstream']] downstream_df = downstream_df.rename(columns={'Value': 'downstream'})[['DateTime', 'downstream']] if qinglong_lake_file: qinglong_lake_df = qinglong_lake_df.rename(columns={'Value': 'qinglong_lake'})[['DateTime', 'qinglong_lake']] # 合并数据 merged_df = pd.merge(upstream_df, downstream_df, on='DateTime', how='inner') if qinglong_lake_file: merged_df = pd.merge(merged_df, qinglong_lake_df, on='DateTime', how='left') print(f"合并前数据行数: {len(merged_df)}") merged_df = merged_df.set_index('DateTime') # 插值:先用线性,再用时间插值,最后用前向后向填充 merged_df['upstream'] = merged_df['upstream'].interpolate(method='linear', limit=4) merged_df['downstream'] = merged_df['downstream'].interpolate(method='linear', limit=4) if qinglong_lake_file: merged_df['qinglong_lake'] = merged_df['qinglong_lake'].interpolate(method='linear', limit=4) merged_df['upstream'] = merged_df['upstream'].interpolate(method='time', limit=24) merged_df['downstream'] = merged_df['downstream'].interpolate(method='time', limit=24) if qinglong_lake_file: merged_df['qinglong_lake'] = merged_df['qinglong_lake'].interpolate(method='time', limit=24) merged_df['upstream'] = merged_df['upstream'].fillna(method='ffill').fillna(method='bfill') merged_df['downstream'] = merged_df['downstream'].fillna(method='ffill').fillna(method='bfill') if qinglong_lake_file: merged_df['qinglong_lake'] = merged_df['qinglong_lake'].fillna(method='ffill').fillna(method='bfill') # 平滑处理:使用滑动窗口移动平均 merged_df['upstream_smooth'] = merged_df['upstream'].rolling(window=24, min_periods=1, center=True).mean() merged_df['downstream_smooth'] = merged_df['downstream'].rolling(window=24, min_periods=1, center=True).mean() if qinglong_lake_file: merged_df['qinglong_lake_smooth'] = merged_df['qinglong_lake'].rolling(window=24, min_periods=1, center=True).mean() # 对低盐度部分用更大窗口平滑 low_sal_mask = merged_df['upstream'] < 50 if low_sal_mask.any(): merged_df.loc[low_sal_mask, 'upstream_smooth'] = merged_df.loc[low_sal_mask, 'upstream']\ .rolling(window=48, min_periods=1, center=True).mean() merged_df = merged_df.dropna() merged_df = merged_df[merged_df['upstream'].apply(np.isfinite)] merged_df = merged_df[merged_df['downstream'].apply(np.isfinite)] if qinglong_lake_file: merged_df = merged_df[merged_df['qinglong_lake'].apply(np.isfinite)] merged_df = merged_df.reset_index() print(f"清洗后数据行数: {len(merged_df)}") print(f"上游盐度范围: {merged_df['upstream'].min()} - {merged_df['upstream'].max()}") print(f"下游盐度范围: {merged_df['downstream'].min()} - {merged_df['downstream'].max()}") if qinglong_lake_file: print(f"青龙湖盐度范围: {merged_df['qinglong_lake'].min()} - {merged_df['qinglong_lake'].max()}") merged_df = merged_df.sort_values('DateTime') return merged_df # ------------------------------- # 添加农历(潮汐)特征 # ------------------------------- def add_lunar_features(df): lunar_day, lunar_phase_sin, lunar_phase_cos, is_high_tide = [], [], [], [] for dt in df['DateTime']: ld = LunarDate.fromSolarDate(dt.year, dt.month, dt.day) lunar_day.append(ld.day) lunar_phase_sin.append(np.sin(2 * np.pi * ld.day / 15)) lunar_phase_cos.append(np.cos(2 * np.pi * ld.day / 15)) is_high_tide.append(1 if (ld.day <= 5 or (ld.day >= 16 and ld.day <= 20)) else 0) df['lunar_day'] = lunar_day df['lunar_phase_sin'] = lunar_phase_sin df['lunar_phase_cos'] = lunar_phase_cos df['is_high_tide'] = is_high_tide return df # ------------------------------- # 批量生成延迟特征(向量化,利用 shift) # ------------------------------- def batch_create_delay_features(df, delay_hours): for delay in delay_hours: df[f'upstream_delay_{delay}h'] = df['upstream'].shift(delay) df[f'downstream_delay_{delay}h'] = df['downstream'].shift(delay) return df # ------------------------------- # 向量化构造训练样本(优化特征工程) # ------------------------------- def create_features_vectorized(df, look_back=96, forecast_horizon=5): """ 利用 numpy 的 sliding_window_view 对历史窗口、下游窗口、标签进行批量切片, 其他特征(时间、农历、统计、延迟特征)直接批量读取后拼接 """ # 这里定义 total_samples 为: total_samples = len(df) - look_back - forecast_horizon + 1 if total_samples <= 0: print("数据不足以创建特征") return np.array([]), np.array([]) # 利用 sliding_window_view 构造历史窗口(上游连续 look_back 个数据) upstream_array = df['upstream'].values # shape (n,) # 滑动窗口,结果 shape (n - look_back + 1, look_back) from numpy.lib.stride_tricks import sliding_window_view window_up = sliding_window_view(upstream_array, window_shape=look_back)[:total_samples, :] # 下游最近 24 小时:利用滑动窗口构造,窗口大小为 24 downstream_array = df['downstream'].values window_down_full = sliding_window_view(downstream_array, window_shape=24) # 对于标签和下游窗口,原逻辑:取 df['downstream'].iloc[i+look_back-24:i+look_back] # 则对应索引为 i+look_back-24, i 从 0 到 total_samples-1 window_down = window_down_full[look_back-24 : look_back-24 + total_samples, :] # 时间特征与农历特征等:取样区间为 df.iloc[look_back: len(df)-forecast_horizon+1] sample_df = df.iloc[look_back: len(df)-forecast_horizon+1].copy() basic_time = sample_df['DateTime'].dt.hour.values.reshape(-1, 1) / 24.0 weekday = sample_df['DateTime'].dt.dayofweek.values.reshape(-1, 1) / 7.0 month = sample_df['DateTime'].dt.month.values.reshape(-1, 1) / 12.0 basic_time_feats = np.hstack([basic_time, weekday, month]) lunar_feats = sample_df[['lunar_phase_sin','lunar_phase_cos','is_high_tide']].values # 统计特征(预先利用 rolling 已计算好,注意取出对应行) try: stats_up = sample_df[['mean_1d_up','mean_3d_up','std_1d_up','max_1d_up','min_1d_up']].values stats_down = sample_df[['mean_1d_down','mean_3d_down','std_1d_down','max_1d_down','min_1d_down']].values except KeyError as e: print(f"统计特征列不存在: {e},请确保先计算统计特征") return np.array([]), np.array([]) # 延迟特征:假设所有延迟特征列名均以 "upstream_delay_" 或 "downstream_delay_" 开头 delay_cols = [col for col in sample_df.columns if col.startswith('upstream_delay_') or col.startswith('downstream_delay_')] delay_feats = sample_df[delay_cols].values # 拼接所有特征:先将历史窗口(window_up)与下游窗口(window_down)拼接,再拼接其他特征 X = np.hstack([window_up, window_down, basic_time_feats, lunar_feats, stats_up, stats_down, delay_feats]) # 构造标签:利用滑动窗口构造 forecast_horizon 内的下游数据 label_full = sliding_window_view(downstream_array, window_shape=forecast_horizon) # 标签区间对应从 index = look_back 到 look_back + total_samples y = label_full[look_back: look_back + total_samples, :] global feature_columns feature_columns = ["combined_vector_features"] print(f"向量化特征工程完成,有效样本数: {X.shape[0]}") return X, y # ------------------------------- # 获取模型准确度指标 # ------------------------------- def get_model_metrics(): """获取保存在模型缓存中的准确度指标""" model_cache_file = 'salinity_model.pkl' if os.path.exists(model_cache_file): try: with open(model_cache_file, 'rb') as f: model_data = pickle.load(f) return { 'rmse': model_data.get('rmse', None), 'mae': model_data.get('mae', None) } except Exception as e: print(f"获取模型指标失败: {e}") return None # ------------------------------- # 模型训练与预测,展示验证准确度(RMSE, MAE) # ------------------------------- def train_and_predict(df, start_time, force_retrain=False): global cached_model, last_training_time model_cache_file = 'salinity_model.pkl' model_needs_training = True if os.path.exists(model_cache_file) and force_retrain: try: os.remove(model_cache_file) print("已删除旧模型缓存(强制重新训练)") except Exception as e: print("删除缓存异常:", e) train_df = df[df['DateTime'] < start_time].copy() if not force_retrain and cached_model is not None and last_training_time is not None: if last_training_time >= train_df['DateTime'].max(): model_needs_training = False print(f"使用缓存模型,训练时间: {last_training_time}") elif not force_retrain and os.path.exists(model_cache_file): try: with open(model_cache_file, 'rb') as f: model_data = pickle.load(f) cached_model = model_data['model'] last_training_time = model_data['training_time'] if last_training_time >= train_df['DateTime'].max(): model_needs_training = False print(f"从文件加载模型,训练时间: {last_training_time}") except Exception as e: print("加载模型失败:", e) if model_needs_training: print("开始训练新模型...") if len(train_df) < 100: print("训练数据不足") return None, None, None, None start_train = time() X, y = create_features_vectorized(train_df, look_back=96, forecast_horizon=5) if len(X) == 0 or len(y) == 0: print("样本生成不足,训练终止") return None, None, None, None print(f"训练样本数量: {X.shape[0]}") X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42) model = XGBRegressor( n_estimators=300, learning_rate=0.03, max_depth=5, min_child_weight=2, subsample=0.85, colsample_bytree=0.85, gamma=0.1, reg_alpha=0.2, reg_lambda=1.5, n_jobs=-1, random_state=42 ) try: model.fit(X_train, y_train, eval_set=[(X_val, y_val)], eval_metric='rmse', early_stopping_rounds=20, verbose=False) # 在验证集上计算 RMSE 和 MAE y_val_pred = model.predict(X_val) rmse = np.sqrt(mean_squared_error(y_val, y_val_pred)) mae = mean_absolute_error(y_val, y_val_pred) print(f"验证集 RMSE: {rmse:.4f}, MAE: {mae:.4f}") last_training_time = start_time cached_model = model with open(model_cache_file, 'wb') as f: pickle.dump({ 'model': model, 'training_time': last_training_time, 'feature_columns': feature_columns, 'rmse': rmse, 'mae': mae }, f) print(f"模型训练完成,耗时: {time() - start_train:.2f}秒") except Exception as e: print("模型训练异常:", e) return None, None, None, None else: model = cached_model # 预测部分:构造单个预测样本(与训练时特征构造一致) try: # 这里采用与 create_features_vectorized 类似的思路构造预测样本 # 取最近数据足够构成历史窗口和其他特征 n = len(df) if n < 96 + 5: print("预测数据不足") return None, None, None, None # 使用 sliding_window_view 构造最新的上游和下游窗口 upstream_array = df['upstream'].values window_up = np.lib.stride_tricks.sliding_window_view(upstream_array, window_shape=96)[-1, :] downstream_array = df['downstream'].values window_down = np.lib.stride_tricks.sliding_window_view(downstream_array, window_shape=24)[-1, :] # 时间特征和农历特征基于当前预测开始时刻 hour_norm = start_time.hour / 24.0 weekday_norm = start_time.dayofweek / 7.0 month_norm = start_time.month / 12.0 basic_time_feats = np.array([hour_norm, weekday_norm, month_norm]).reshape(1, -1) ld = LunarDate.fromSolarDate(start_time.year, start_time.month, start_time.day) lunar_feats = np.array([np.sin(2*np.pi*ld.day/15), np.cos(2*np.pi*ld.day/15), 1 if (ld.day <=5 or (ld.day >=16 and ld.day<=20)) else 0]).reshape(1, -1) # 统计特征:用最新 24/72 小时数据(取末尾24/72) try: # 优先使用DataFrame中已计算的统计特征 stats_up = df[['mean_1d_up','mean_3d_up','std_1d_up','max_1d_up','min_1d_up']].iloc[-1:].values stats_down = df[['mean_1d_down','mean_3d_down','std_1d_down','max_1d_down','min_1d_down']].iloc[-1:].values except KeyError: # 如果不存在,则直接计算 recent_up = df['upstream'].values[-24:] stats_up = np.array([np.mean(recent_up), np.mean(df['upstream'].values[-72:]), np.std(recent_up), np.max(recent_up), np.min(recent_up)]).reshape(1, -1) recent_down = df['downstream'].values[-24:] stats_down = np.array([np.mean(recent_down), np.mean(df['downstream'].values[-72:]), np.std(recent_down), np.max(recent_down), np.min(recent_down)]).reshape(1, -1) # 延迟特征:直接从最后一行延迟特征取值 delay_cols = [col for col in df.columns if col.startswith('upstream_delay_') or col.startswith('downstream_delay_')] delay_feats = df[delay_cols].iloc[-1:].values # shape (1, ?) # 拼接所有预测特征 X_pred = np.hstack([window_up.reshape(1, -1), window_down.reshape(1, -1), basic_time_feats, lunar_feats, stats_up, stats_down, delay_feats]) if np.isnan(X_pred).any() or np.isinf(X_pred).any(): X_pred = np.nan_to_num(X_pred, nan=0.0, posinf=1e6, neginf=-1e6) predictions = model.predict(X_pred) # 生成未来日期标签(预测未来 5 天) future_dates = [start_time + timedelta(days=i) for i in range(5)] print("预测完成") # 获取模型指标 metrics = None if os.path.exists(model_cache_file): try: with open(model_cache_file, 'rb') as f: model_data = pickle.load(f) metrics = { 'rmse': model_data.get('rmse', None), 'mae': model_data.get('mae', None) } except Exception as e: print(f"获取模型指标失败: {e}") return future_dates, predictions.flatten(), model, metrics except Exception as e: print("预测过程异常:", e) return None, None, None, None # ------------------------------- # GUI界面部分 # ------------------------------- def run_gui(): def configure_gui_fonts(): font_names = ['微软雅黑', 'Microsoft YaHei', 'SimSun', 'SimHei'] for font_name in font_names: try: default_font = tkfont.nametofont("TkDefaultFont") default_font.configure(family=font_name) text_font = tkfont.nametofont("TkTextFont") text_font.configure(family=font_name) fixed_font = tkfont.nametofont("TkFixedFont") fixed_font.configure(family=font_name) return True except Exception as e: continue return False def on_predict(): try: predict_start = time() status_label.config(text="预测中...") root.update() start_time_dt = pd.to_datetime(entry.get()) force_retrain = retrain_var.get() future_dates, predictions, model, metrics = train_and_predict(df, start_time_dt, force_retrain) if future_dates is None or predictions is None: status_label.config(text="预测失败") return # 获取并显示模型准确度指标 if metrics: metrics_text = f"模型准确度 - RMSE: {metrics['rmse']:.4f}, MAE: {metrics['mae']:.4f}" metrics_label.config(text=metrics_text) ax.clear() # 绘制历史数据(最近 120 天) history_end = min(start_time_dt, df['DateTime'].max()) history_start = history_end - timedelta(days=120) hist_data = df[(df['DateTime'] >= history_start) & (df['DateTime'] <= history_end)] ax.plot(hist_data['DateTime'], hist_data['downstream'], label='一取水(下游)盐度', color='blue', linewidth=1.5) ax.plot(hist_data['DateTime'], hist_data['upstream_smooth'], label='青龙港(上游)盐度', color='purple', linewidth=1.5, alpha=0.7) if 'qinglong_lake_smooth' in hist_data.columns: ax.plot(hist_data['DateTime'], hist_data['qinglong_lake_smooth'], label='青龙湖盐度', color='green', linewidth=1.5, alpha=0.7) ax.plot(future_dates, predictions, marker='o', linestyle='--', label='预测盐度', color='red', linewidth=2) actual_data = df[(df['DateTime'] >= start_time_dt) & (df['DateTime'] <= future_dates[-1])] if not actual_data.empty: ax.plot(actual_data['DateTime'], actual_data['downstream'], marker='s', linestyle='-', label='实际盐度', color='orange', linewidth=2) std_dev = hist_data['downstream'].std() * 0.5 ax.fill_between(future_dates, predictions - std_dev, predictions + std_dev, color='red', alpha=0.2) ax.set_xlabel('日期') ax.set_ylabel('盐度') ax.set_title(f"从 {start_time_dt.strftime('%Y-%m-%d %H:%M:%S')} 开始的盐度预测") ax.legend(loc='upper left') fig.tight_layout() canvas.draw() predict_time = time() - predict_start status_label.config(text=f"预测完成 (耗时: {predict_time:.2f}秒)") result_text = "预测结果:\n" for i, (date, pred) in enumerate(zip(future_dates, predictions)): result_text += f"第 {i+1} 天 ({date.strftime('%Y-%m-%d')}): {pred:.2f}\n" result_label.config(text=result_text) except Exception as e: status_label.config(text=f"错误: {str(e)}") def on_scroll(event): xlim = ax.get_xlim() ylim = ax.get_ylim() zoom_factor = 1.1 x_data = event.xdata if event.xdata is not None else (xlim[0]+xlim[1])/2 y_data = event.ydata if event.ydata is not None else (ylim[0]+ylim[1])/2 x_rel = (x_data - xlim[0]) / (xlim[1] - xlim[0]) y_rel = (y_data - ylim[0]) / (ylim[1] - ylim[0]) if event.step > 0: new_width = (xlim[1]-xlim[0]) / zoom_factor new_height = (ylim[1]-ylim[0]) / zoom_factor x0 = x_data - x_rel * new_width y0 = y_data - y_rel * new_height ax.set_xlim([x0, x0+new_width]) ax.set_ylim([y0, y0+new_height]) else: new_width = (xlim[1]-xlim[0]) * zoom_factor new_height = (ylim[1]-ylim[0]) * zoom_factor x0 = x_data - x_rel * new_width y0 = y_data - y_rel * new_height ax.set_xlim([x0, x0+new_width]) ax.set_ylim([y0, y0+new_height]) canvas.draw_idle() def update_cursor(event): if event.inaxes == ax: canvas.get_tk_widget().config(cursor="fleur") else: canvas.get_tk_widget().config(cursor="") def reset_view(): display_history() status_label.config(text="图表视图已重置") root = tk.Tk() root.title("青龙港-陈行盐度预测系统") try: configure_gui_fonts() except Exception as e: print("字体配置异常:", e) input_frame = ttk.Frame(root, padding="10") input_frame.pack(fill=tk.X) control_frame = ttk.Frame(root, padding="5") control_frame.pack(fill=tk.X) result_frame = ttk.Frame(root, padding="10") result_frame.pack(fill=tk.BOTH, expand=True) ttk.Label(input_frame, text="输入开始时间 (YYYY-MM-DD HH:MM:SS)").pack(side=tk.LEFT) entry = ttk.Entry(input_frame, width=25) entry.pack(side=tk.LEFT, padx=5) predict_button = ttk.Button(input_frame, text="预测", command=on_predict) predict_button.pack(side=tk.LEFT) status_label = ttk.Label(input_frame, text="提示: 第一次运行请勾选'强制重新训练模型'") status_label.pack(side=tk.LEFT, padx=10) retrain_var = tk.BooleanVar(value=False) ttk.Checkbutton(control_frame, text="强制重新训练模型", variable=retrain_var).pack(side=tk.LEFT) legend_label = ttk.Label(control_frame, text="图例: 紫色=青龙港上游数据, 蓝色=一取水下游数据, 红色=预测值, 绿色=实际值") legend_label.pack(side=tk.LEFT, padx=10) reset_button = ttk.Button(control_frame, text="重置视图", command=reset_view) reset_button.pack(side=tk.LEFT, padx=5) # 添加显示模型准确度的标签 metrics_frame = ttk.Frame(root, padding="5") metrics_frame.pack(fill=tk.X) model_metrics = get_model_metrics() metrics_text = "模型准确度: 未知" if not model_metrics else f"模型准确度 - RMSE: {model_metrics['rmse']:.4f}, MAE: {model_metrics['mae']:.4f}" metrics_label = ttk.Label(metrics_frame, text=metrics_text) metrics_label.pack(side=tk.LEFT, padx=10) result_label = ttk.Label(result_frame, text="", justify=tk.LEFT) result_label.pack(side=tk.RIGHT, fill=tk.Y) fig, ax = plt.subplots(figsize=(10,5), dpi=100) canvas = FigureCanvasTkAgg(fig, master=result_frame) canvas.get_tk_widget().pack(side=tk.LEFT, fill=tk.BOTH, expand=True) toolbar_frame = ttk.Frame(result_frame) toolbar_frame.pack(side=tk.BOTTOM, fill=tk.X) toolbar = NavigationToolbar2Tk(canvas, toolbar_frame) toolbar.update() canvas.mpl_connect('scroll_event', on_scroll) canvas.mpl_connect('motion_notify_event', update_cursor) def display_history(): ax.clear() end_date = df['DateTime'].max() start_date = max(df['DateTime'].min(), end_date - timedelta(days=60)) hist_data = df[(df['DateTime']>=start_date)&(df['DateTime']<=end_date)] ax.plot(hist_data['DateTime'], hist_data['downstream'], label='一取水(下游)盐度', color='blue', linewidth=1.5) ax.plot(hist_data['DateTime'], hist_data['upstream_smooth'], label='青龙港(上游)盐度', color='purple', linewidth=1.5, alpha=0.7) ax.set_xlabel('日期') ax.set_ylabel('盐度') ax.set_title('历史盐度数据对比') ax.legend() fig.tight_layout() canvas.draw() display_history() root.mainloop() # ------------------------------- # 主程序入口:加载数据、添加特征、生成延迟特征后启动GUI # ------------------------------- def save_processed_data(df, filename='processed_data.pkl'): try: df.to_pickle(filename) print(f"已保存处理后的数据到 {filename}") return True except Exception as e: print(f"保存数据失败: {e}") return False def load_processed_data(filename='processed_data.pkl'): try: if os.path.exists(filename): df = pd.read_pickle(filename) print(f"已从 {filename} 加载处理后的数据") return df else: print(f"找不到处理后的数据文件 {filename}") return None except Exception as e: print(f"加载数据失败: {e}") return None # 尝试加载处理后的数据,如果不存在则重新处理 processed_data = load_processed_data() if processed_data is not None: df = processed_data else: df = load_data('青龙港1.csv', '一取水.csv') if df is not None: df = add_lunar_features(df) delay_hours = [1,2,3,4,6,12,24,36,48,60,72,84,96,108,120] df = batch_create_delay_features(df, delay_hours) # 添加统计特征 df['mean_1d_up'] = df['upstream'].rolling(window=24, min_periods=1).mean() df['mean_3d_up'] = df['upstream'].rolling(window=72, min_periods=1).mean() df['std_1d_up'] = df['upstream'].rolling(window=24, min_periods=1).std() df['max_1d_up'] = df['upstream'].rolling(window=24, min_periods=1).max() df['min_1d_up'] = df['upstream'].rolling(window=24, min_periods=1).min() df['mean_1d_down'] = df['downstream'].rolling(window=24, min_periods=1).mean() df['mean_3d_down'] = df['downstream'].rolling(window=72, min_periods=1).mean() df['std_1d_down'] = df['downstream'].rolling(window=24, min_periods=1).std() df['max_1d_down'] = df['downstream'].rolling(window=24, min_periods=1).max() df['min_1d_down'] = df['downstream'].rolling(window=24, min_periods=1).min() # 保存处理后的数据 save_processed_data(df) if df is not None: run_gui() else: print("数据加载失败,无法运行预测。")