# Modified XGBoost version
import os
import pickle
import pandas as pd
import numpy as np
from numpy.lib.stride_tricks import sliding_window_view
import tkinter as tk
import tkinter.font as tkfont
from tkinter import ttk
from datetime import timedelta
from time import time
import matplotlib.pyplot as plt
from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg, NavigationToolbar2Tk
from xgboost import XGBRegressor
from lunardate import LunarDate
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
import matplotlib
from scipy.signal import savgol_filter
import matplotlib.dates as mdates
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge, Lasso
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor

# Configure matplotlib to render Chinese labels
matplotlib.rcParams['font.sans-serif'] = ['SimHei', 'Microsoft YaHei', 'SimSun', 'Arial Unicode MS']
matplotlib.rcParams['axes.unicode_minus'] = False
matplotlib.rcParams['font.family'] = 'sans-serif'

# Global caches and feature names
cached_model = None
last_training_time = None
feature_columns = None
current_view = {'xlim': None, 'ylim': None}  # stores the current chart view
prediction_mode = "青龙港-陈行"  # default prediction mode
current_df = None  # dataset currently in use


# Improved anomaly filtering for salinity data
def filter_salinity_anomalies(df, threshold_ratio=0.5, window_size=5, max_days=1):
    # Work on a copy so the original data is not modified
    filtered_df = df.copy()

    # Make sure date information is accessible (DateTime is the index)
    values = filtered_df['Value'].values
    dates = filtered_df.index.values  # dates come from the index

    # 1. First handle individual anomalous points
    i = 1
    while i < len(values):
        # Check whether the current value drops below threshold_ratio of the previous value
        if values[i] < values[i-1] * threshold_ratio:
            baseline = values[i-1]  # baseline is the last normal salinity value
            anomaly_start = i
            j = i

            # Scan forward until a point no longer falls below threshold_ratio of the
            # baseline, or the elapsed time exceeds max_days
            anomaly_start_date = dates[anomaly_start]
            max_date = anomaly_start_date + np.timedelta64(int(max_days*24), 'h')

            while j < len(values) and values[j] < baseline * threshold_ratio and dates[j] <= max_date:
                j += 1

            anomaly_end = j - 1  # end position of the anomalous interval

            # Repair the anomalous interval
            if anomaly_end - anomaly_start < 3:  # short interval: linear interpolation
                if j < len(values):
                    # Data exists beyond the interval: interpolate linearly
                    for k in range(anomaly_start, anomaly_end + 1):
                        # Smooth transition between the baseline and the first normal value after the interval
                        ratio = (k - anomaly_start + 1) / (anomaly_end - anomaly_start + 2)
                        values[k] = baseline * (1 - ratio) + values[j] * ratio
                        # Ensure the smoothed value does not drop below threshold_ratio of the baseline
                        values[k] = max(values[k], baseline * threshold_ratio)
                else:
                    # Interval runs to the end of the data: fill with baseline * threshold_ratio
                    for k in range(anomaly_start, anomaly_end + 1):
                        values[k] = baseline * threshold_ratio
            else:  # long interval: use a simpler smoothing scheme to avoid interpolation errors
                # Linear interpolation avoids non-finite values
                if j < len(values):
                    end_val = values[j]
                    # Linearly interpolate each point
                    for k in range(anomaly_start, anomaly_end + 1):
                        fraction = (k - anomaly_start) / (j - anomaly_start) if j > anomaly_start else 0
                        interpolated = baseline * (1 - fraction) + end_val * fraction
                        values[k] = max(interpolated, baseline * threshold_ratio)
                else:
                    # Interval runs to the end of the data: fill with baseline * threshold_ratio
                    for k in range(anomaly_start, anomaly_end + 1):
                        values[k] = baseline * threshold_ratio

            i = j  # skip past the repaired interval
        else:
            i += 1

    # 2. Apply a Savitzky-Golay filter for overall smoothing
    if len(values) > window_size:
        # window_size must be odd
        if window_size % 2 == 0:
            window_size += 1

        # Apply the Savitzky-Golay filter
        try:
            # Smooth the data while preserving its original character
            smoothed = savgol_filter(values, window_size, 3)

            # Ensure smoothed values never fall below threshold_ratio of the preceding point
            for i in range(1, len(smoothed)):
                smoothed[i] = max(smoothed[i], smoothed[i-1] * threshold_ratio)

            values = smoothed
        except Exception as e:
            print(f"Savitzky-Golay filtering failed: {e}")

    filtered_df['Value'] = values
    return filtered_df
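
# A minimal, uncalled sketch of how filter_salinity_anomalies behaves; the toy
# series below is invented purely for illustration.
def _demo_filter_salinity_anomalies():
    demo = pd.DataFrame(
        {'Value': [300.0, 310.0, 90.0, 95.0, 305.0, 308.0]},
        index=pd.date_range('2024-01-01', periods=6, freq='H'))
    demo.index.name = 'DateTime'
    # The two points that fall below 0.5x the previous value form an anomalous
    # interval and are interpolated back toward the 310 baseline.
    print(filter_salinity_anomalies(demo, threshold_ratio=0.5, window_size=5)['Value'])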

# Data loading and preprocessing
# -------------------------------
def load_data(upstream_file, downstream_file, river_level_file=None, flow_file=None, source_name="青龙港"):
    """
    Load all related data and apply data-quality processing.
    """
    try:
        # Read upstream and downstream data
        upstream_df = pd.read_csv(upstream_file)
        downstream_df = pd.read_csv(downstream_file)
    except FileNotFoundError:
        print("File not found; please check the paths")
        return None

    # Normalize column names
    upstream_df.columns = ['DateTime', 'TagName', 'Value']
    downstream_df.columns = ['DateTime', 'TagName', 'Value']

    # Parse timestamps
    upstream_df['DateTime'] = pd.to_datetime(upstream_df['DateTime'])
    downstream_df['DateTime'] = pd.to_datetime(downstream_df['DateTime'])

    # Use DateTime as the index
    upstream_df.set_index('DateTime', inplace=True)
    downstream_df.set_index('DateTime', inplace=True)

    # Apply the salinity anomaly filter
    upstream_df = filter_salinity_anomalies(upstream_df, threshold_ratio=0.5, window_size=7, max_days=1)
    downstream_df = filter_salinity_anomalies(downstream_df, threshold_ratio=0.5, window_size=7, max_days=1)

    # Handle low salinity values (< 5):
    # rather than dropping them, mark them as NaN and repair via interpolation
    for df in [upstream_df, downstream_df]:
        # Mark low salinity values as NaN
        low_salinity_mask = df['Value'] < 5
        if low_salinity_mask.any():
            print(f"Found {low_salinity_mask.sum()} low salinity values (<5); repairing via interpolation")
            df.loc[low_salinity_mask, 'Value'] = np.nan

        # Linear interpolation for short gaps
        df['Value'] = df['Value'].interpolate(method='linear', limit=4)

        # Time-based interpolation for longer gaps
        df['Value'] = df['Value'].interpolate(method='time', limit=24)

        # Forward/backward fill for anything left
        df['Value'] = df['Value'].ffill().bfill()

        # Smooth with a small rolling median window
        df['Value'] = df['Value'].rolling(window=6, center=True, min_periods=1).median()

    # Rename the Value columns
    upstream_df = upstream_df.rename(columns={'Value': 'upstream'})[['upstream']]
    downstream_df = downstream_df.rename(columns={'Value': 'downstream'})[['downstream']]

    # Merge the two series
    merged_df = pd.merge(upstream_df, downstream_df, left_index=True, right_index=True, how='inner')

    # Record the data-source name
    merged_df['source_name'] = source_name

    # Load Yangtze river water-level data
    if river_level_file:
        try:
            river_level_df = pd.read_csv(river_level_file)
            print(f"Read water-level file: {river_level_file}")

            # Normalize column names
            if len(river_level_df.columns) >= 3:
                river_level_df.columns = ['DateTime', 'TagName', 'Value']
            elif len(river_level_df.columns) == 2:
                river_level_df.columns = ['DateTime', 'Value']
                river_level_df['TagName'] = 'water_level'

            # Basic processing
            river_level_df['DateTime'] = pd.to_datetime(river_level_df['DateTime'])
            river_level_df.set_index('DateTime', inplace=True)
            river_level_df['Value'] = pd.to_numeric(river_level_df['Value'], errors='coerce')

            # Clip outliers with the IQR rule
            Q1 = river_level_df['Value'].quantile(0.25)
            Q3 = river_level_df['Value'].quantile(0.75)
            IQR = Q3 - Q1
            lower_bound = Q1 - 1.5 * IQR
            upper_bound = Q3 + 1.5 * IQR
            river_level_df.loc[river_level_df['Value'] < lower_bound, 'Value'] = lower_bound
            river_level_df.loc[river_level_df['Value'] > upper_bound, 'Value'] = upper_bound

            # Rename and keep only the needed column
            river_level_df = river_level_df.rename(columns={'Value': 'water_level'})[['water_level']]

            # Merge into the main frame
            merged_df = pd.merge(merged_df, river_level_df, left_index=True, right_index=True, how='left')

            # Interpolate the water-level data
            merged_df['water_level'] = merged_df['water_level'].interpolate(method='time', limit=24)
            merged_df['water_level'] = merged_df['water_level'].ffill().bfill()

            # Smoothed water level
            merged_df['water_level_smooth'] = merged_df['water_level'].rolling(window=24, min_periods=1, center=True).mean()

            # Water-level trend features
            merged_df['water_level_trend_1h'] = merged_df['water_level_smooth'].diff(1)
            merged_df['water_level_trend_24h'] = merged_df['water_level_smooth'].diff(24)

            print(f"Water-level data loaded, range: {merged_df['water_level'].min()} - {merged_df['water_level'].max()}")
        except Exception as e:
            print(f"Failed to load water-level data: {str(e)}")

    # Load Datong discharge (flow) data
    if flow_file:
        try:
            flow_df = pd.read_csv(flow_file)
            print(f"Read flow file: {flow_file}")

            # Normalize column names
            if len(flow_df.columns) >= 3:
                flow_df.columns = ['DateTime', 'TagName', 'Value']
            elif len(flow_df.columns) == 2:
                flow_df.columns = ['DateTime', 'Value']
                flow_df['TagName'] = 'flow'

            # Basic processing
            flow_df['DateTime'] = pd.to_datetime(flow_df['DateTime'])
            flow_df.set_index('DateTime', inplace=True)
            flow_df['Value'] = pd.to_numeric(flow_df['Value'], errors='coerce')

            # Clip outliers with the IQR rule
            Q1 = flow_df['Value'].quantile(0.25)
            Q3 = flow_df['Value'].quantile(0.75)
            IQR = Q3 - Q1
            lower_bound = Q1 - 1.5 * IQR
            upper_bound = Q3 + 1.5 * IQR
            flow_df.loc[flow_df['Value'] < lower_bound, 'Value'] = lower_bound
            flow_df.loc[flow_df['Value'] > upper_bound, 'Value'] = upper_bound

            # Rename and keep only the needed column
            flow_df = flow_df.rename(columns={'Value': 'flow'})[['flow']]

            # Merge into the main frame
            merged_df = pd.merge(merged_df, flow_df, left_index=True, right_index=True, how='left')

            # Interpolate the flow data
            merged_df['flow'] = merged_df['flow'].interpolate(method='time', limit=24)
            merged_df['flow'] = merged_df['flow'].ffill().bfill()

            # Smoothed flow
            merged_df['flow_smooth'] = merged_df['flow'].rolling(window=24, min_periods=1, center=True).mean()

            # Flow trend features
            merged_df['flow_trend_1h'] = merged_df['flow_smooth'].diff(1)
            merged_df['flow_trend_24h'] = merged_df['flow_smooth'].diff(24)

            # Flow statistics
            merged_df['mean_1d_flow'] = merged_df['flow_smooth'].rolling(window=24, min_periods=1).mean()
            merged_df['mean_3d_flow'] = merged_df['flow_smooth'].rolling(window=72, min_periods=1).mean()
            merged_df['std_1d_flow'] = merged_df['flow_smooth'].rolling(window=24, min_periods=1).std()

            # Flow change features
            merged_df['flow_change_1h'] = merged_df['flow_smooth'].diff(1)
            merged_df['flow_change_24h'] = merged_df['flow_smooth'].diff(24)

            print(f"Flow data loaded, range: {merged_df['flow'].min()} - {merged_df['flow'].max()} m³/s")
        except Exception as e:
            print(f"Failed to load flow data: {str(e)}")

    # Interpolate and smooth the salinity series
    merged_df['upstream'] = merged_df['upstream'].interpolate(method='time', limit=24)
    merged_df['downstream'] = merged_df['downstream'].interpolate(method='time', limit=24)

    # Forward/backward fill the remaining NaNs
    merged_df['upstream'] = merged_df['upstream'].ffill().bfill()
    merged_df['downstream'] = merged_df['downstream'].ffill().bfill()

    # Smoothed salinity series
    merged_df['upstream_smooth'] = merged_df['upstream'].rolling(window=24, min_periods=1, center=True).mean()
    merged_df['downstream_smooth'] = merged_df['downstream'].rolling(window=24, min_periods=1, center=True).mean()

    # Trend features
    merged_df['upstream_trend_1h'] = merged_df['upstream_smooth'].diff(1)
    merged_df['upstream_trend_24h'] = merged_df['upstream_smooth'].diff(24)
    merged_df['downstream_trend_1h'] = merged_df['downstream_smooth'].diff(1)
    merged_df['downstream_trend_24h'] = merged_df['downstream_smooth'].diff(24)

    # Fill trend NaNs
    merged_df['upstream_trend_1h'] = merged_df['upstream_trend_1h'].fillna(0)
    merged_df['upstream_trend_24h'] = merged_df['upstream_trend_24h'].fillna(0)
    merged_df['downstream_trend_1h'] = merged_df['downstream_trend_1h'].fillna(0)
    merged_df['downstream_trend_24h'] = merged_df['downstream_trend_24h'].fillna(0)

    # Use a larger window to smooth the low-salinity portions
    low_sal_mask = merged_df['upstream'] < 50
    if low_sal_mask.any():
        merged_df.loc[low_sal_mask, 'upstream_smooth'] = merged_df.loc[low_sal_mask, 'upstream']\
            .rolling(window=48, min_periods=1, center=True).mean()

    # Data validation and statistics
    print("\nData quality statistics:")
    print(f"Total rows: {len(merged_df)}")
    print(f"Upstream ({source_name}) salinity range: {merged_df['upstream_smooth'].min():.2f} - {merged_df['upstream_smooth'].max():.2f}")
    print(f"Downstream (陈行) salinity range: {merged_df['downstream_smooth'].min():.2f} - {merged_df['downstream_smooth'].max():.2f}")

    if 'water_level' in merged_df.columns:
        print(f"Water-level range: {merged_df['water_level_smooth'].min():.2f} - {merged_df['water_level_smooth'].max():.2f}")
        print(f"Water-level missing ratio: {merged_df['water_level_smooth'].isna().mean()*100:.2f}%")

    if 'flow' in merged_df.columns:
        print(f"Flow range: {merged_df['flow_smooth'].min():.2f} - {merged_df['flow_smooth'].max():.2f} m³/s")
        print(f"Flow missing ratio: {merged_df['flow_smooth'].isna().mean()*100:.2f}%")

    # Reset the index so DateTime becomes a column
    merged_df = merged_df.reset_index()

    return merged_df
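
# A minimal, uncalled sketch of the IQR clipping rule used above for the
# water-level and flow data: values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] are
# clipped to the bounds rather than dropped. The numbers are invented.
def _demo_iqr_clip():
    s = pd.Series([10.0, 11.0, 12.0, 11.5, 10.5, 99.0])  # 99.0 is an outlier
    q1, q3 = s.quantile(0.25), s.quantile(0.75)
    iqr = q3 - q1
    # The outlier is pulled down to the upper bound; normal values are untouched
    print(s.clip(lower=q1 - 1.5 * iqr, upper=q3 + 1.5 * iqr))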


def resample_to_hourly(df):
    """
    Resample minute-level data to hourly resolution by averaging within each hour.
    """
    try:
        # Make sure DateTime is the index
        if 'DateTime' in df.columns:
            df = df.set_index('DateTime')

        # Collect the numeric columns
        numeric_columns = df.select_dtypes(include=[np.number]).columns.tolist()

        # Resample to hourly means
        hourly_df = df[numeric_columns].resample('H').mean()

        # Reset the index so DateTime becomes a column again
        hourly_df = hourly_df.reset_index()

        print(f"Resampled from minute to hourly resolution; rows before: {len(df)}, after: {len(hourly_df)}")
        return hourly_df

    except Exception as e:
        print(f"Resampling failed: {e}")
        return df




# # Quick test
# df = load_data('yuce_data/青龙港1.csv', 'yuce_data/一取水.csv')
# # Resample to hourly resolution
# df = resample_to_hourly(df)
# df.to_csv('merged_data_hour.csv', index=False)
# print(f"Merged data saved to 'merged_data_hour.csv' successfully")

# # Plot salinity over time
# plt.figure(figsize=(12, 6))
# plt.plot(df['DateTime'], df['upstream_smooth'], label='Upstream salinity', color='blue')
# plt.plot(df['DateTime'], df['downstream_smooth'], label='Downstream salinity', color='red')
# plt.axhline(y=250, color='red', linestyle='--', alpha=0.7, linewidth=1.5, label='Salinity warning line (250)')
# plt.xlabel('Time')
# plt.ylabel('Salinity')
# plt.title('Salinity over time')
# plt.legend()
# plt.grid(True)
# plt.tight_layout()
# plt.savefig('salinity_time_series.png', dpi=300)
# plt.show()

# -------------------------------
# Add lunar (tidal) features
# -------------------------------
def add_lunar_features(df):
    lunar_day, lunar_phase_sin, lunar_phase_cos, is_high_tide = [], [], [], []
    for dt in df['DateTime']:
        ld = LunarDate.fromSolarDate(dt.year, dt.month, dt.day)
        lunar_day.append(ld.day)
        lunar_phase_sin.append(np.sin(2 * np.pi * ld.day / 15))
        lunar_phase_cos.append(np.cos(2 * np.pi * ld.day / 15))
        is_high_tide.append(1 if (ld.day <= 5 or (ld.day >= 16 and ld.day <= 20)) else 0)
    df['lunar_day'] = lunar_day
    df['lunar_phase_sin'] = lunar_phase_sin
    df['lunar_phase_cos'] = lunar_phase_cos
    df['is_high_tide'] = is_high_tide
    return df
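
# A minimal, uncalled sketch of the lunar encoding above: the lunar day is
# mapped onto a 15-day cycle (roughly the half-lunar-month spring-neap tide
# period), so day 1 and day 16 land at nearly the same phase angle. The date
# below is arbitrary.
def _demo_lunar_features():
    ld = LunarDate.fromSolarDate(2024, 6, 1)
    print(ld.day, np.sin(2 * np.pi * ld.day / 15), np.cos(2 * np.pi * ld.day / 15))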


# -------------------------------
# Generate delay features (vectorized)
# -------------------------------
def batch_create_delay_features(df, delay_hours):
    """
    Create delayed (lagged) copies of selected columns.
    """
    # Columns that get delay features
    target_columns = ['upstream_smooth']

    # target_columns = ['upstream_smooth', 'downstream_smooth']
    # # If a water-level column exists, delay features could be created for it too (currently unused)
    # if 'water_level_smooth' in df.columns:
    #     target_columns.append('water_level_smooth')
    # elif 'water_level' in df.columns:
    #     print("Note: no smoothed water-level column; using the raw water-level column for delay features")
    #     # Build the smoothed water-level column
    #     df['water_level_smooth'] = df['water_level'].rolling(window=24, min_periods=1, center=True).mean()
    #     df['water_level_smooth'] = df['water_level_smooth'].fillna(df['water_level'])
    #     target_columns.append('water_level_smooth')

    # Create the delay features
    for column in target_columns:
        if column in df.columns:
            for delay in delay_hours:
                df[f'{column.split("_")[0]}_delay_{delay}h'] = df[column].shift(delay)
        else:
            print(f"Warning: column {column} does not exist; skipping its delay features")

    return df
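
# A minimal, uncalled sketch of the delay-feature naming: shifting
# 'upstream_smooth' by 48 hours yields a column named 'upstream_delay_48h'
# (the prefix is everything before the first underscore). Toy data only.
def _demo_delay_features():
    demo = pd.DataFrame({'upstream_smooth': np.arange(100.0)})
    demo = batch_create_delay_features(demo, delay_hours=[48])
    print(demo[['upstream_smooth', 'upstream_delay_48h']].tail(3))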


# Generate additional features
def generate_features(df):
    """
    Generate additional features (history, time, statistics, and external data)
    and add them to the original DataFrame.
    """
    try:
        # Smoothed salinity series
        df['upstream_smooth'] = df['upstream'].rolling(window=24, min_periods=1, center=True).mean()
        df['downstream_smooth'] = df['downstream'].rolling(window=24, min_periods=1, center=True).mean()

        # Time features
        df['hour'] = df['DateTime'].dt.hour
        df['weekday'] = df['DateTime'].dt.dayofweek
        df['month'] = df['DateTime'].dt.month

        # Sine/cosine transforms of the time features
        df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)
        df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)
        df['weekday_sin'] = np.sin(2 * np.pi * df['weekday'] / 7)
        df['weekday_cos'] = np.cos(2 * np.pi * df['weekday'] / 7)
        df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
        df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)

        # Statistical features
        df['mean_1d_up'] = df['upstream_smooth'].rolling(window=24, min_periods=1).mean()
        df['mean_3d_up'] = df['upstream_smooth'].rolling(window=72, min_periods=1).mean()
        df['std_1d_up'] = df['upstream_smooth'].rolling(window=24, min_periods=1).std()

        df['mean_1d_down'] = df['downstream_smooth'].rolling(window=24, min_periods=1).mean()
        df['mean_3d_down'] = df['downstream_smooth'].rolling(window=72, min_periods=1).mean()
        df['std_1d_down'] = df['downstream_smooth'].rolling(window=24, min_periods=1).std()

        # Trend features
        df['trend_1h_up'] = df['upstream_smooth'].diff(1)
        df['trend_3h_up'] = df['upstream_smooth'].diff(3)
        df['trend_6h_up'] = df['upstream_smooth'].diff(6)
        df['trend_12h_up'] = df['upstream_smooth'].diff(12)
        df['trend_24h_up'] = df['upstream_smooth'].diff(24)

        df['trend_1h_down'] = df['downstream_smooth'].diff(1)
        df['trend_3h_down'] = df['downstream_smooth'].diff(3)
        df['trend_6h_down'] = df['downstream_smooth'].diff(6)
        df['trend_12h_down'] = df['downstream_smooth'].diff(12)
        df['trend_24h_down'] = df['downstream_smooth'].diff(24)

        # External features (water level and flow)
        if 'water_level_smooth' in df.columns:
            df['water_level_trend_1h'] = df['water_level_smooth'].diff(1)
            df['water_level_trend_24h'] = df['water_level_smooth'].diff(24)
            df['mean_1d_water_level'] = df['water_level_smooth'].rolling(window=24, min_periods=1).mean()
            df['mean_3d_water_level'] = df['water_level_smooth'].rolling(window=72, min_periods=1).mean()
            df['std_1d_water_level'] = df['water_level_smooth'].rolling(window=24, min_periods=1).std()

        if 'flow_smooth' in df.columns:
            df['flow_trend_1h'] = df['flow_smooth'].diff(1)
            df['flow_trend_24h'] = df['flow_smooth'].diff(24)
            df['mean_1d_flow'] = df['flow_smooth'].rolling(window=24, min_periods=1).mean()
            df['mean_3d_flow'] = df['flow_smooth'].rolling(window=72, min_periods=1).mean()
            df['std_1d_flow'] = df['flow_smooth'].rolling(window=24, min_periods=1).std()

        return df

    except Exception as e:
        print(f"Feature generation failed: {e}")
        return df
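
# A minimal, uncalled sketch of why the hour is encoded with sin/cos above:
# on the unit circle, hour 23 and hour 0 are adjacent, whereas the raw values
# 23 and 0 look maximally far apart to a model.
def _demo_cyclical_hour():
    for hour in (0, 23):
        print(hour, np.sin(2 * np.pi * hour / 24), np.cos(2 * np.pi * hour / 24))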


# -------------------------------
# Main entry helpers: load data, add features, create delay features, then launch the GUI
# -------------------------------
def save_processed_data(df, filename='processed_data.pkl'):
    try:
        df.to_pickle(filename)
        print(f"Saved processed data to {filename}")
        return True
    except Exception as e:
        print(f"Failed to save data: {e}")
        return False

def load_processed_data(filename='processed_data.pkl'):
    try:
        if os.path.exists(filename):
            df = pd.read_pickle(filename)
            print(f"Loaded processed data from {filename}")
            return df
        else:
            print(f"Processed data file {filename} not found")
            return None
    except Exception as e:
        print(f"Failed to load data: {e}")
        return None

def load_both_datasets():
    """Load the datasets for both data sources."""
    global cached_model, last_training_time

    # Delete stale processed-data and model files so the fixed code takes effect
    for file in ['processed_data_qinglong.pkl', 'processed_data_taicang.pkl',
                 'salinity_model_qinglong.pkl', 'salinity_model_taicang.pkl']:
        if os.path.exists(file):
            try:
                os.remove(file)
                print(f"Deleted stale file: {file}")
            except Exception as e:
                print(f"Failed to delete file: {file} - {e}")

    # Load the 青龙港-陈行 dataset
    qinglong_df = load_processed_data('processed_data_qinglong.pkl')
    if qinglong_df is None:
        # Build the dataset from scratch
        print("Processing the 青龙港-陈行 dataset...")
        qinglong_df = load_data('yuce_data/青龙港盐度1.csv', 'yuce_data/陈行第一取水口盐度.csv',
                                'yuce_data/长江液位.csv', 'yuce_data/大通流量.csv', source_name="青龙港")
        if qinglong_df is not None:
            # Time features
            qinglong_df['hour'] = qinglong_df['DateTime'].dt.hour
            qinglong_df['weekday'] = qinglong_df['DateTime'].dt.dayofweek
            qinglong_df['month'] = qinglong_df['DateTime'].dt.month

            # Lunar features
            qinglong_df = add_lunar_features(qinglong_df)

            # Delay features (青龙港-陈行: 3-7 days)
            delay_hours = [36,42,48,54,60,72,78,84,90,96,102,108,114,120,126,132,138,144,150,156,162,168]
            qinglong_df = batch_create_delay_features(qinglong_df, delay_hours)

            # Statistical features
            qinglong_df = generate_features(qinglong_df)

            # Resample to hourly resolution
            qinglong_df = resample_to_hourly(qinglong_df)

            # Persist the processed dataset
            save_processed_data(qinglong_df, 'processed_data_qinglong.pkl')
            print("青龙港-陈行 dataset processed")
    else:
        print("Loaded the 青龙港-陈行 dataset from cache")

    # Load the 太仓石化-陈行 dataset
    taicang_df = load_processed_data('processed_data_taicang.pkl')
    if taicang_df is None:
        # Build the dataset from scratch
        print("Processing the 太仓石化-陈行 dataset...")
        taicang_df = load_data('yuce_data/太仓石化盐度2.csv', 'yuce_data/陈行第一取水口盐度.csv',
                               'yuce_data/长江液位.csv', 'yuce_data/大通流量.csv', source_name="太仓石化")
        if taicang_df is not None:
            # Time features
            taicang_df['hour'] = taicang_df['DateTime'].dt.hour
            taicang_df['weekday'] = taicang_df['DateTime'].dt.dayofweek
            taicang_df['month'] = taicang_df['DateTime'].dt.month

            # Lunar features
            taicang_df = add_lunar_features(taicang_df)

            # Delay features (太仓石化-陈行: 1-3 days)
            delay_hours = [1,6,12,18,24,30,36,42,48,54,60,66,72]
            taicang_df = batch_create_delay_features(taicang_df, delay_hours)

            # Statistical features
            taicang_df = generate_features(taicang_df)

            # Resample to hourly resolution
            taicang_df = resample_to_hourly(taicang_df)

            # Persist the processed dataset
            save_processed_data(taicang_df, 'processed_data_taicang.pkl')
            print("太仓石化-陈行 dataset processed")
    else:
        print("Loaded the 太仓石化-陈行 dataset from cache")

    return qinglong_df, taicang_df


# -------------------------------
# Model training and prediction, with validation accuracy (RMSE, MAE)
# -------------------------------
def train_and_predict(df, start_time, force_retrain=False):
    global cached_model, last_training_time, prediction_mode

    # Pick the model cache file for the current prediction mode
    if prediction_mode == "青龙港-陈行":
        model_cache_file = 'salinity_model_qinglong.pkl'
        # 青龙港-陈行 mode uses a 14-day (336-hour) look-back window
        look_back_hours = 336
    else:  # 太仓石化-陈行
        model_cache_file = 'salinity_model_taicang.pkl'
        # 太仓石化-陈行 mode uses a 7-day (168-hour) look-back window
        look_back_hours = 168

    model_needs_training = True

    if os.path.exists(model_cache_file) and force_retrain:
        try:
            os.remove(model_cache_file)
            print("Deleted the old model cache (forced retraining)")
        except Exception as e:
            print("Failed to delete cache:", e)

    train_df = df[df['DateTime'] < start_time].copy()

    # Build trial features to check the current feature dimensionality
    test_X, test_y = create_features_vectorized(train_df, look_back=look_back_hours, forecast_horizon=24)
    if test_X is None or test_y is None:
        print("Feature generation failed")
        return None, None, None, None

    current_feature_dim = test_X.shape[1] if len(test_X) > 0 else 0
    print(f"Current feature dimension: {current_feature_dim}")

    cached_feature_dim = None

    if not force_retrain and cached_model is not None and last_training_time is not None:
        if last_training_time >= train_df['DateTime'].max():
            try:
                cached_feature_dim = cached_model.n_features_in_
                print(f"Cached model feature dimension: {cached_feature_dim}")

                if cached_feature_dim == current_feature_dim:
                    model_needs_training = False
                    print(f"Using the cached model, trained at: {last_training_time}")
                else:
                    print(f"Feature dimension mismatch (cached: {cached_feature_dim}, current: {current_feature_dim}); retraining required")
            except Exception as e:
                print(f"Failed to check the model feature dimension: {e}")
    elif not force_retrain and os.path.exists(model_cache_file):
        try:
            with open(model_cache_file, 'rb') as f:
                model_data = pickle.load(f)
                # The cache stores a dict of models under 'models' (there is no 'model' key)
                file_models = model_data.get('models', {})
                last_training_time = model_data.get('training_time', None)

                try:
                    cached_feature_dim = model_data.get('feature_dim', None)
                    if cached_feature_dim is None and 'xgb' in file_models:
                        cached_feature_dim = file_models['xgb'].n_features_in_
                    print(f"File-cached model feature dimension: {cached_feature_dim}")
                    if cached_feature_dim == current_feature_dim:
                        if last_training_time is not None and last_training_time >= train_df['DateTime'].max():
                            model_needs_training = False
                            print(f"Loaded model from file, trained at: {last_training_time}")
                    else:
                        print(f"Feature dimension mismatch (file model: {cached_feature_dim}, current: {current_feature_dim}); retraining required")
                except Exception as e:
                    print(f"Failed to check the model feature dimension: {e}")
        except Exception as e:
            print("Failed to load the model:", e)

    if model_needs_training:
        print(f"Training a new {prediction_mode} model ensemble...")
        if len(train_df) < 100:
            print("Not enough training data")
            return None, None, None, None

        start_train = time()
        X, y = create_features_vectorized(train_df, look_back=look_back_hours, forecast_horizon=24)
        if X is None or y is None:
            print("Feature generation failed")
            return None, None, None, None

        if len(X) == 0 or len(y) == 0:
            print("Not enough samples generated; aborting training")
            return None, None, None, None

        print(f"Training samples: {X.shape[0]}, feature dimension: {X.shape[1]}")
        X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

        # Standardize features to improve model performance
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_val_scaled = scaler.transform(X_val)

        # Train a set of different models
        models = {}

        # XGBoost model
        if prediction_mode == "青龙港-陈行":
            # Parameters tuned for 青龙港-陈行
            models['xgb'] = XGBRegressor(
                n_estimators=300,
                learning_rate=0.05,
                max_depth=7,
                min_child_weight=3,
                subsample=0.8,
                colsample_bytree=0.8,
                gamma=0.2,
                reg_alpha=0.2,
                reg_lambda=1.0,
                n_jobs=-1,
                random_state=42,
                eval_metric='rmse',
                early_stopping_rounds=15
            )
        else:
            # Parameters tuned for 太仓石化-陈行
            models['xgb'] = XGBRegressor(
                n_estimators=250,
                learning_rate=0.08,
                max_depth=5,
                min_child_weight=2,
                subsample=0.85,
                colsample_bytree=0.75,
                gamma=0.1,
                reg_alpha=0.1,
                reg_lambda=1.5,
                n_jobs=-1,
                random_state=42,
                eval_metric='rmse',
                early_stopping_rounds=12
            )

        # Random forest regressor
        models['rf'] = RandomForestRegressor(
            n_estimators=200,
            max_depth=12,
            min_samples_split=5,
            min_samples_leaf=2,
            n_jobs=-1,
            random_state=42
        )

        # Ridge and Lasso regression
        models['ridge'] = Ridge(alpha=1.0, random_state=42)
        models['lasso'] = Lasso(alpha=0.1, random_state=42)

        # Support vector regression
        models['svr'] = SVR(kernel='rbf', C=10, gamma='scale')

        # Neural network regressor
        models['mlp'] = MLPRegressor(
            hidden_layer_sizes=(100, 50),
            activation='relu',
            solver='adam',
            alpha=0.0001,
            batch_size='auto',
            max_iter=500,
            early_stopping=True,
            random_state=42
        )

        # Train every model and evaluate it
        model_performances = {}
        model_predictions = {}
        best_model = None
        best_rmse = float('inf')

        for name, model in models.items():
            try:
                print(f"Training the {name} model...")
                if name == 'xgb':
                    # eval_metric and early_stopping_rounds are set in the constructor above
                    model.fit(X_train_scaled, y_train,
                              eval_set=[(X_val_scaled, y_val)],
                              verbose=True)
                else:
                    model.fit(X_train_scaled, y_train)

                # Evaluate on the validation set
                y_val_pred = model.predict(X_val_scaled)
                rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
                mae = mean_absolute_error(y_val, y_val_pred)

                model_performances[name] = {'rmse': rmse, 'mae': mae}
                model_predictions[name] = y_val_pred

                print(f"{name} model - validation RMSE: {rmse:.4f}, MAE: {mae:.4f}")

                # Track the best single model
                if rmse < best_rmse:
                    best_rmse = rmse
                    best_model = name

            except Exception as e:
                print(f"Training the {name} model failed: {e}")

        # Report the performance of all models
        print("\nModel performance comparison:")
        for name, metrics in model_performances.items():
            print(f"{name}: RMSE = {metrics['rmse']:.4f}, MAE = {metrics['mae']:.4f}")

        if best_model:
            print(f"\nBest single model: {best_model}, RMSE = {model_performances[best_model]['rmse']:.4f}")

        # Build the weighted ensemble
        print("\nBuilding the weighted ensemble...")
        model_weights = {}
        valid_models = 0

        # Assign weights from validation performance
        for name, metrics in model_performances.items():
            # Weight inversely proportional to RMSE (lower RMSE gets more weight)
            weight = 1.0 / (metrics['rmse'] + 1e-10)  # small epsilon avoids division by zero
            model_weights[name] = weight
            valid_models += 1

        # Normalize the weights
        total_weight = sum(model_weights.values())
        for name in model_weights:
            model_weights[name] /= total_weight
            print(f"Model {name} weight: {model_weights[name]:.4f}")

        # Weighted ensemble prediction on the validation set
        ensemble_pred = np.zeros_like(y_val)
        for name, pred in model_predictions.items():
            if name in model_weights:
                ensemble_pred += pred * model_weights[name]

        # Evaluate the ensemble
        ensemble_rmse = np.sqrt(mean_squared_error(y_val, ensemble_pred))
        ensemble_mae = mean_absolute_error(y_val, ensemble_pred)
        print(f"Ensemble performance: RMSE = {ensemble_rmse:.4f}, MAE = {ensemble_mae:.4f}")

        # Feature importance analysis (XGBoost and RandomForest only)
        feature_importance = None
        if 'xgb' in models:
            feature_importance = models['xgb'].feature_importances_
            sorted_idx = np.argsort(feature_importance)[::-1]

            # Generate feature names dynamically so they match the feature count
            feature_names = []

            # Numeric columns
            numeric_columns = df.select_dtypes(include=[np.number]).columns.tolist()
            if 'DateTime' in numeric_columns:
                numeric_columns.remove('DateTime')

            # Name the raw window features of the key columns
            # (following the output of create_features_vectorized)
            key_columns = ['upstream_smooth', 'downstream_smooth']
            if 'upstream_trend_24h' in numeric_columns:
                key_columns.extend(['upstream_trend_24h', 'downstream_trend_24h'])
            if 'water_level_smooth' in numeric_columns:
                key_columns.append('water_level_smooth')
            if 'flow_smooth' in numeric_columns:
                key_columns.append('flow_smooth')

            # One name per time step and key column
            for col in key_columns:
                for i in range(look_back_hours):
                    feature_names.append(f'{col}_t{i}')

            # Time and statistical feature names
            feature_names.extend([
                'month_sin', 'month_cos', 'day_sin', 'day_cos',
                'hour_sin', 'hour_cos', 'yearday_sin', 'yearday_cos',
                'yearday_linear', 'weekday'
            ])

            for col in ['upstream_smooth', 'downstream_smooth']:
                feature_names.extend([
                    f'{col}_mean', f'{col}_std', f'{col}_max', f'{col}_min',
                    f'{col}_last', f'{col}_mean_24h', f'{col}_diff_1h', f'{col}_diff_24h'
                ])

            feature_names.extend(['lunar_phase_sin', 'lunar_phase_cos', 'is_high_tide'])

            # Cross-feature names (mirroring create_features_vectorized)
            feature_names.extend(['salinity_ratio', 'salinity_diff',
                                  'up_change_rate', 'down_change_rate', 'rate_diff'])
            for col in ['upstream_smooth', 'downstream_smooth']:
                feature_names.extend([f'{col}_x_month_sin', f'{col}_x_month_cos',
                                      f'{col}_x_hour_sin', f'{col}_x_hour_cos'])
            feature_names.extend(['lunar_sin_x_down', 'lunar_cos_x_down', 'high_tide_x_down'])
            if 'water_level_smooth' in numeric_columns:
                feature_names.append('water_level_x_down')
            if 'flow_smooth' in numeric_columns:
                feature_names.append('flow_x_down')

            # Make sure the names line up with the importance array
            if len(feature_names) != len(feature_importance):
                print(f"Note: feature name count ({len(feature_names)}) does not match the importance array length ({len(feature_importance)})")
                print("Falling back to auto-generated feature names")
                # Regenerate names of the right length
                feature_names = [f'feature_{i}' for i in range(len(feature_importance))]

            # Print the top 15 features
            print(f"\n{prediction_mode} model, top 15 features:")
            for i in range(min(15, len(sorted_idx))):
                print(f"{i+1}. {feature_names[sorted_idx[i]]}: {feature_importance[sorted_idx[i]]:.6f}")

        # Persist all models, the weights, and the scaler
        cached_models = models
        cached_model_weights = model_weights
        last_training_time = start_time

        with open(model_cache_file, 'wb') as f:
            pickle.dump({
                'models': models,
                'model_weights': model_weights,
                'best_model': best_model,
                'scaler': scaler,
                'training_time': last_training_time,
                'feature_names': feature_names if 'xgb' in models else None,
                'ensemble_rmse': ensemble_rmse,
                'ensemble_mae': ensemble_mae,
                'model_performances': model_performances,
                'look_back_hours': look_back_hours,
                'feature_dim': current_feature_dim
            }, f)
        print(f"{prediction_mode} model training finished in {time() - start_train:.2f}s, feature dimension: {current_feature_dim}")
    else:
        # Load the models from the cache file
        try:
            with open(model_cache_file, 'rb') as f:
                model_data = pickle.load(f)
                cached_models = model_data.get('models', {})
                cached_model_weights = model_data.get('model_weights', {})
                scaler = model_data.get('scaler', None)
                look_back_hours = model_data.get('look_back_hours', look_back_hours)
                best_model = model_data.get('best_model', None)
                print(f"Loaded {len(cached_models)} models from cache, weights: {cached_model_weights}")
        except Exception as e:
            print(f"Failed to load the models: {e}")
            return None, None, None, None

    # Prediction
    try:
        # Dates to predict (5 days ahead)
        future_dates = [start_time + timedelta(days=i) for i in range(5)]
        predictions = np.zeros(5)

        # Build the feature matrix for prediction
        X_pred = []
        for i in range(5):
            current_date = future_dates[i]
            features = generate_prediction_features(df, current_date, look_back=look_back_hours)
            if features is None:
                print(f"Failed to generate prediction features for {current_date}")
                return None, None, None, None
            X_pred.append(features)

        # Standardize the prediction features
        X_pred = np.array(X_pred)
        if scaler is not None:
            try:
                X_pred_scaled = scaler.transform(X_pred)
            except Exception:
                print("Feature standardization failed; falling back to raw features")
                X_pred_scaled = X_pred
        else:
            X_pred_scaled = X_pred

        # Ensemble prediction
        print("Predicting with the model ensemble...")
        predictions = np.zeros(len(future_dates))

        # Use the freshly trained models and weights when we just trained
        if model_needs_training:
            cached_models = models
            cached_model_weights = model_weights

        individual_predictions = {}
        for name, model in cached_models.items():
            try:
                model_pred = model.predict(X_pred_scaled)
                individual_predictions[name] = model_pred
                weight = cached_model_weights.get(name, 0)
                predictions += model_pred * weight
                print(f"Model {name} weight: {weight:.4f}, prediction: {model_pred}")
            except Exception as e:
                print(f"Model {name} prediction failed: {e}")

        # Print every model's prediction per date
        for date_idx, date in enumerate(future_dates):
            print(f"\nPredictions for {date.strftime('%Y-%m-%d')}:")
            for name, preds in individual_predictions.items():
                print(f"  {name}: {preds[date_idx]:.2f}")
            print(f"  ensemble: {predictions[date_idx]:.2f}")

        # Prediction confidence interval
        if model_needs_training:
            # Use the ensemble RMSE as the error estimate
            train_std = ensemble_rmse
        else:
            # Use the RMSE stored in the model cache as the error estimate
            try:
                with open(model_cache_file, 'rb') as f:
                    model_data = pickle.load(f)
                    train_std = model_data.get('ensemble_rmse', 1.0)
            except Exception:
                train_std = 1.0

        # 90% confidence interval (z = 1.645)
        prediction_intervals = np.array([
            predictions - 1.645 * train_std,
            predictions + 1.645 * train_std
        ])

        return future_dates, predictions, cached_models, prediction_intervals
    except Exception as e:
        print("Prediction failed:", e)
        return None, None, None, None
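
# A minimal, uncalled sketch of the inverse-RMSE weighting used during training
# above: each model's weight is 1/RMSE, normalized over all models, so RMSEs of
# 2.0 and 4.0 yield weights of roughly 0.667 and 0.333. The numbers are invented.
def _demo_inverse_rmse_weights():
    rmses = {'xgb': 2.0, 'rf': 4.0}
    raw = {name: 1.0 / (rmse + 1e-10) for name, rmse in rmses.items()}
    total = sum(raw.values())
    weights = {name: w / total for name, w in raw.items()}
    print(weights)  # {'xgb': ~0.667, 'rf': ~0.333}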


# -------------------------------
# Model accuracy metrics
# -------------------------------
def get_model_metrics():
    """Read the accuracy metrics stored in the model cache."""
    global prediction_mode

    # Pick the model cache file for the current prediction mode
    if prediction_mode == "青龙港-陈行":
        model_cache_file = 'salinity_model_qinglong.pkl'
    else:  # 太仓石化-陈行
        model_cache_file = 'salinity_model_taicang.pkl'

    if os.path.exists(model_cache_file):
        try:
            with open(model_cache_file, 'rb') as f:
                model_data = pickle.load(f)
                # train_and_predict stores these under 'ensemble_rmse'/'ensemble_mae'
                return {
                    'rmse': model_data.get('ensemble_rmse', model_data.get('rmse', None)),
                    'mae': model_data.get('ensemble_mae', model_data.get('mae', None))
                }
        except Exception as e:
            print(f"Failed to read model metrics: {e}")
    return None


def run_gui():
    """Run the GUI."""
    global qinglong_df, taicang_df
    def configure_gui_fonts():
        font_names = ['微软雅黑', 'Microsoft YaHei', 'SimSun', 'SimHei']
        for font_name in font_names:
            try:
                default_font = tkfont.nametofont("TkDefaultFont")
                default_font.configure(family=font_name)
                text_font = tkfont.nametofont("TkTextFont")
                text_font.configure(family=font_name)
                fixed_font = tkfont.nametofont("TkFixedFont")
                fixed_font.configure(family=font_name)
                return True
            except Exception:
                continue
        return False

    def format_metrics_text(model_metrics):
        # Guard against missing metrics so formatting never raises
        if not model_metrics or model_metrics.get('rmse') is None:
            return "Model accuracy: unknown"
        return f"Model accuracy - RMSE: {model_metrics['rmse']:.4f}, MAE: {model_metrics['mae']:.4f}"

    def switch_prediction_mode():
        global prediction_mode, current_df, df, cached_model, last_training_time

        # Toggle the prediction mode
        if prediction_mode == "青龙港-陈行":
            prediction_mode = "太仓石化-陈行"
            current_df = taicang_df
            switch_button.config(text="Switch to 青龙港-陈行")
        else:
            prediction_mode = "青龙港-陈行"
            current_df = qinglong_df
            switch_button.config(text="Switch to 太仓石化-陈行")

        # Update the active dataset
        df = current_df

        # Reset the model cache
        cached_model = None
        last_training_time = None

        # Update the window title
        root.title(f"{prediction_mode} Salinity Prediction System")

        # Update the status message
        status_label.config(text=f"Switched to {prediction_mode} mode")

        # Update the model metrics
        metrics_label.config(text=format_metrics_text(get_model_metrics()))

        # Show the historical data
        display_history_data()

    def on_predict():
        try:
            predict_start = time()
            status_label.config(text="Predicting...")
            root.update()
            start_time_dt = pd.to_datetime(entry.get())
            force_retrain = retrain_var.get()
            future_dates, predictions, model, prediction_intervals = train_and_predict(df, start_time_dt, force_retrain)
            if future_dates is None or predictions is None:
                status_label.config(text="Prediction failed")
                return

            # Fetch and display the model accuracy metrics
            model_metrics = get_model_metrics()
            if model_metrics:
                metrics_label.config(text=format_metrics_text(model_metrics))

            # Clear and redraw the chart
            ax.clear()

            # Plot the history (everything before the prediction start)
            history_end = min(start_time_dt, df['DateTime'].max())
            history_start = df['DateTime'].min()  # use all available history
            hist_data = df[(df['DateTime'] >= history_start) & (df['DateTime'] <= history_end)]

            # Make sure there is data to plot
            if len(hist_data) == 0:
                status_label.config(text="Error: no historical data in the selected range")
                return

            # Use source_name if present, otherwise infer it from the prediction mode
            if 'source_name' in hist_data.columns:
                source = hist_data["source_name"].iloc[0]
            else:
                source = "青龙港" if prediction_mode == "青龙港-陈行" else "太仓石化"

            # Base series
            ax.plot(hist_data['DateTime'], hist_data['downstream_smooth'],
                    label='陈行 (downstream) salinity', color='blue', linewidth=1.5)
            ax.plot(hist_data['DateTime'], hist_data['upstream_smooth'],
                    label=f'{source} (upstream) salinity', color='purple', linewidth=1.5, alpha=0.7)

            # Salinity warning line at 250
            ax.axhline(y=250, color='red', linestyle='--', alpha=0.7, linewidth=1.5, label='Salinity warning line (250)')

            # Predicted series
            if len(future_dates) > 0 and len(predictions) > 0:
                ax.plot(future_dates, predictions, marker='o', linestyle='--',
                        label='Recursive prediction', color='red', linewidth=2)

                # Confidence band around the prediction (z = 1.645, i.e. 90%)
                if prediction_intervals is not None:
                    ax.fill_between(future_dates, prediction_intervals[0], prediction_intervals[1],
                                    color='red', alpha=0.2, label='90% confidence interval')

            # Actual data over the prediction window (if any)
            actual_data = df[(df['DateTime'] >= start_time_dt) & (df['DateTime'] <= future_dates[-1])]
            actual_values = None

            if not actual_data.empty:
                actual_values = []
                # Take the actual sample closest to each predicted date
                for pred_date in future_dates:
                    closest_idx = np.argmin(np.abs(actual_data['DateTime'] - pred_date))
                    actual_values.append(actual_data['downstream_smooth'].iloc[closest_idx])

                # Actual salinity curve
                ax.plot(future_dates, actual_values, marker='s', linestyle='-',
                        label='Actual salinity', color='orange', linewidth=2)

            # Chart title and labels
            ax.set_xlabel('Date')
            ax.set_ylabel('Salinity')
            ax.set_title(f"{prediction_mode} recursive one-step salinity prediction starting {start_time_dt.strftime('%Y-%m-%d %H:%M:%S')}")

            # Legend and tight layout
            ax.legend(loc='best')
            fig.tight_layout()

            # Remember the initial view so it can be restored
            global current_view
            current_view['xlim'] = ax.get_xlim()
            current_view['ylim'] = ax.get_ylim()

            # Redraw the embedded canvas (do not plt.close() the figure; the Tk
            # canvas still owns it)
            canvas.draw()
            canvas.flush_events()

            # Update the prediction summary
            predict_time = time() - predict_start
            status_label.config(text=f"Recursive prediction finished ({predict_time:.2f}s)")

            # Show the prediction results
            result_text = "Recursive one-step prediction results:\n\n"

            # When actual values exist, also report the difference
            if actual_values is not None:
                result_text += "Date        Predicted  Actual   Diff\n"
                result_text += "--------------------------------------\n"
                for i, (date, pred, actual) in enumerate(zip(future_dates, predictions, actual_values)):
                    if actual is not None:  # report the difference only when an actual value exists
                        diff = pred - actual
                        result_text += f"{date.strftime('%Y-%m-%d')}  {pred:6.2f}   {actual:6.2f}  {diff:6.2f}\n"
                    else:
                        result_text += f"{date.strftime('%Y-%m-%d')}  {pred:6.2f}       --      --\n"
            else:
                result_text += "Date        Predicted\n"
                result_text += "-------------------\n"
                for i, (date, pred) in enumerate(zip(future_dates, predictions)):
                    result_text += f"{date.strftime('%Y-%m-%d')}  {pred:6.2f}\n"
                result_text += "\nNo actual values available for comparison"

            update_result_text(result_text)
        except Exception as e:
            status_label.config(text=f"Error: {str(e)}")
            import traceback
            traceback.print_exc()

    def display_history_data():
        """Show the historical salinity data."""
        try:
            # Clear and redraw the chart
            ax.clear()

            # Use the full history
            start_date = df['DateTime'].min()
            end_date = df['DateTime'].max()
            hist_data = df.copy()  # all data

            # Make sure there is data to plot
            if len(hist_data) == 0:
                status_label.config(text="Error: no historical data available")
                return

            # Base series
            ax.plot(hist_data['DateTime'], hist_data['downstream_smooth'],
                    label='陈行 (downstream) salinity', color='blue', linewidth=1.5)

            # Use source_name if present, otherwise infer it from the prediction mode
            if 'source_name' in hist_data.columns:
                source = hist_data["source_name"].iloc[0]
            else:
                source = "青龙港" if prediction_mode == "青龙港-陈行" else "太仓石化"

            ax.plot(hist_data['DateTime'], hist_data['upstream_smooth'],
                    label=f'{source} (upstream) salinity', color='purple', linewidth=1.5, alpha=0.7)

            # Salinity warning line at 250
            ax.axhline(y=250, color='red', linestyle='--', alpha=0.7, linewidth=1.5, label='Salinity warning line (250)')

            # Chart title and labels
            ax.set_xlabel('Date')
            ax.set_ylabel('Salinity')
            ax.set_title(f"{prediction_mode} full salinity history ({start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')})")

            # Legend and tight layout
            ax.legend(loc='best')
            fig.tight_layout()

            # Remember the initial view so it can be restored
            global current_view
            current_view['xlim'] = ax.get_xlim()
            current_view['ylim'] = ax.get_ylim()

            # Redraw the embedded canvas
            canvas.draw()
            canvas.flush_events()

            status_label.config(text=f"Showing the full history ({len(hist_data)} points)")

            # Update the summary text
            result_text = "Historical salinity statistics:\n\n"
            result_text += f"Time range: {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}\n"
            result_text += f"Data points: {len(hist_data)}\n\n"
            result_text += f"{source} upstream salinity:\n"
            result_text += f"  min: {hist_data['upstream_smooth'].min():.2f}\n"
            result_text += f"  max: {hist_data['upstream_smooth'].max():.2f}\n"
            result_text += f"  mean: {hist_data['upstream_smooth'].mean():.2f}\n"
            result_text += f"  std: {hist_data['upstream_smooth'].std():.2f}\n\n"
            result_text += "陈行 downstream salinity:\n"
            result_text += f"  min: {hist_data['downstream_smooth'].min():.2f}\n"
            result_text += f"  max: {hist_data['downstream_smooth'].max():.2f}\n"
            result_text += f"  mean: {hist_data['downstream_smooth'].mean():.2f}\n"
            result_text += f"  std: {hist_data['downstream_smooth'].std():.2f}\n"

            update_result_text(result_text)
        except Exception as e:
            status_label.config(text=f"Failed to show the history: {str(e)}")
            import traceback
            traceback.print_exc()

    def on_scroll(event):
        xlim = ax.get_xlim()
        ylim = ax.get_ylim()
        zoom_factor = 1.1
        x_data = event.xdata if event.xdata is not None else (xlim[0]+xlim[1])/2
        y_data = event.ydata if event.ydata is not None else (ylim[0]+ylim[1])/2
        x_rel = (x_data - xlim[0]) / (xlim[1] - xlim[0])
        y_rel = (y_data - ylim[0]) / (ylim[1] - ylim[0])
        if event.step > 0:
            new_width = (xlim[1]-xlim[0]) / zoom_factor
            new_height = (ylim[1]-ylim[0]) / zoom_factor
            x0 = x_data - x_rel * new_width
            y0 = y_data - y_rel * new_height
            ax.set_xlim([x0, x0+new_width])
            ax.set_ylim([y0, y0+new_height])
        else:
            new_width = (xlim[1]-xlim[0]) * zoom_factor
            new_height = (ylim[1]-ylim[0]) * zoom_factor
            x0 = x_data - x_rel * new_width
            y0 = y_data - y_rel * new_height
            ax.set_xlim([x0, x0+new_width])
            ax.set_ylim([y0, y0+new_height])
        canvas.draw_idle()

    def update_cursor(event):
        if event.inaxes == ax:
            canvas.get_tk_widget().config(cursor="fleur")
        else:
            canvas.get_tk_widget().config(cursor="")

    def reset_view():
        global current_view
        if current_view['xlim'] is not None:
            # Restore the saved view range
            ax.set_xlim(current_view['xlim'])
            ax.set_ylim(current_view['ylim'])

            # Apply tight layout and redraw
            fig.tight_layout()
            canvas.draw_idle()
            status_label.config(text="Chart view reset")
        else:
            status_label.config(text="No initial view range available")

    root = tk.Tk()
    root.title(f"{prediction_mode} Salinity Prediction System")
    try:
        configure_gui_fonts()
    except Exception as e:
        print("Font configuration failed:", e)

    # Input row and control buttons
    input_frame = ttk.Frame(root, padding="10")
    input_frame.pack(fill=tk.X)

    ttk.Label(input_frame, text="Start time (YYYY-MM-DD HH:MM:SS)").pack(side=tk.LEFT)
    entry = ttk.Entry(input_frame, width=25)
    entry.pack(side=tk.LEFT, padx=5)
    predict_button = ttk.Button(input_frame, text="Predict", command=on_predict)
    predict_button.pack(side=tk.LEFT)
    status_label = ttk.Label(input_frame, text="Tip: check 'Force model retraining' for the first run")
    status_label.pack(side=tk.LEFT, padx=10)

    control_frame = ttk.Frame(root, padding="5")
    control_frame.pack(fill=tk.X)
    retrain_var = tk.BooleanVar(value=False)
    ttk.Checkbutton(control_frame, text="Force model retraining", variable=retrain_var).pack(side=tk.LEFT)

    # Button to show the history
    history_button = ttk.Button(control_frame, text="Show history", command=display_history_data)
    history_button.pack(side=tk.LEFT, padx=5)

    # Button to switch the data source
    switch_button = ttk.Button(control_frame, text="Switch to 太仓石化-陈行", command=switch_prediction_mode)
    switch_button.pack(side=tk.LEFT, padx=5)

    # Legend description, including the warning line
    legend_label = ttk.Label(control_frame, text="Legend: purple = upstream, blue = downstream, red = prediction, orange = actual, red dashed = warning line (250)")
    legend_label.pack(side=tk.LEFT, padx=10)
    reset_button = ttk.Button(control_frame, text="Reset view", command=reset_view)
    reset_button.pack(side=tk.LEFT, padx=5)

    # Model accuracy label
    metrics_frame = ttk.Frame(root, padding="5")
    metrics_frame.pack(fill=tk.X)
    metrics_label = ttk.Label(metrics_frame, text=format_metrics_text(get_model_metrics()))
    metrics_label.pack(side=tk.LEFT, padx=10)

    # Results area
    result_frame = ttk.Frame(root, padding="10")
    result_frame.pack(fill=tk.BOTH, expand=True)

    # Chart on the left
    plot_frame = ttk.Frame(result_frame, width=800, height=600)
    plot_frame.pack(side=tk.LEFT, fill=tk.BOTH, expand=True)
    plot_frame.pack_propagate(False)  # keep the frame from resizing to its contents

    # Text results on the right
    text_frame = ttk.Frame(result_frame)
    text_frame.pack(side=tk.RIGHT, fill=tk.Y)

    # Monospaced font for the results
    result_font = tkfont.Font(family="Courier New", size=10, weight="normal")

    # Text box with a scrollbar
    result_text = tk.Text(text_frame, width=50, height=25, font=result_font, wrap=tk.NONE)
    result_text.pack(side=tk.LEFT, fill=tk.BOTH)
    result_scroll = ttk.Scrollbar(text_frame, orient="vertical", command=result_text.yview)
    result_scroll.pack(side=tk.RIGHT, fill=tk.Y)
    result_text.configure(yscrollcommand=result_scroll.set)
    result_text.configure(state=tk.DISABLED)  # start read-only

    # Helper for updating the results text
    def update_result_text(text):
        result_text.configure(state=tk.NORMAL)
        result_text.delete(1.0, tk.END)
        result_text.insert(tk.END, text)
        result_text.configure(state=tk.DISABLED)

    # Higher-DPI figure for better rendering quality
    fig, ax = plt.subplots(figsize=(10, 6), dpi=100)
    fig.tight_layout(pad=3.0)  # extra padding keeps labels from being clipped

    # Canvas inside the fixed-size frame
    canvas = FigureCanvasTkAgg(fig, master=plot_frame)
    canvas.get_tk_widget().pack(side=tk.TOP, fill=tk.BOTH, expand=True)

    # Toolbar with zoom, save, etc.
    toolbar_frame = ttk.Frame(plot_frame)
    toolbar_frame.pack(side=tk.BOTTOM, fill=tk.X)
    toolbar = NavigationToolbar2Tk(canvas, toolbar_frame)
    toolbar.update()

    # Keep the layout tight when the window is resized
    def on_resize(event):
        fig.tight_layout()
        canvas.draw_idle()

    # Chart interactions
    canvas.mpl_connect('resize_event', on_resize)
    canvas.mpl_connect('scroll_event', on_scroll)
    canvas.mpl_connect('motion_notify_event', update_cursor)

    # Mouse-drag panning
    def on_press(event):
        if event.inaxes != ax:
            return
        canvas.get_tk_widget().config(cursor="fleur")
        ax._pan_start = (event.x, event.y, event.xdata, event.ydata)

    def on_release(event):
        ax._pan_start = None
        canvas.get_tk_widget().config(cursor="")
        canvas.draw_idle()

    def on_motion(event):
        if not hasattr(ax, '_pan_start') or ax._pan_start is None:
            return
        if event.inaxes != ax:
            return

        start_x, start_y, x_data, y_data = ax._pan_start
        dx = event.x - start_x
        dy = event.y - start_y

        # Current view
        xlim = ax.get_xlim()
        ylim = ax.get_ylim()

        # Convert the pixel movement into data coordinates
        x_scale = (xlim[1] - xlim[0]) / canvas.get_tk_widget().winfo_width()
        y_scale = (ylim[1] - ylim[0]) / canvas.get_tk_widget().winfo_height()

        # Shift the view
        ax.set_xlim(xlim[0] - dx * x_scale, xlim[1] - dx * x_scale)
        ax.set_ylim(ylim[0] + dy * y_scale, ylim[1] + dy * y_scale)

        # Update the drag origin
        ax._pan_start = (event.x, event.y, event.xdata, event.ydata)

        canvas.draw_idle()

    # Wire up the mouse events
    canvas.mpl_connect('button_press_event', on_press)
    canvas.mpl_connect('button_release_event', on_release)
    canvas.mpl_connect('motion_notify_event', on_motion)

    # Show the history on startup
    display_history_data()

    root.mainloop()


# Vectorized construction of training samples (optimized feature engineering)
# -------------------------------
def create_features_vectorized(df, look_back=168, forecast_horizon=24):
    """
    Build training samples: the past look_back hours predict the mean downstream
    salinity over the next forecast_horizon hours.
    Enhanced version: adds extra time and statistical features plus feature crosses.
    """
    try:
        # Sort by time
        df = df.sort_values('DateTime')

        # All numeric columns (excluding DateTime)
        numeric_columns = df.select_dtypes(include=[np.number]).columns.tolist()
        if 'DateTime' in numeric_columns:
            numeric_columns.remove('DateTime')

        # Drop unneeded columns to keep the dimensionality down
        exclude_columns = ['source_name', 'hour', 'weekday', 'month']  # avoid duplicated features
        numeric_columns = [col for col in numeric_columns if col not in exclude_columns
                           and not (col.endswith('_sin') or col.endswith('_cos'))]

        # Feature and label containers
        features = []  # X inputs
        targets = []   # y outputs

        # Build samples with a sliding window
        for i in range(len(df) - look_back - forecast_horizon + 1):
            # Feature window and target window
            window = df.iloc[i:i+look_back]
            target_window = df.iloc[i+look_back:i+look_back+forecast_horizon]

            # Base features - restrict to the key columns to limit dimensionality
            key_columns = ['upstream_smooth', 'downstream_smooth']

            # Smoothed trend columns
            if 'upstream_trend_24h' in numeric_columns:
                key_columns.extend(['upstream_trend_24h', 'downstream_trend_24h'])

            # Water-level and flow columns (if present)
            if 'water_level_smooth' in numeric_columns:
                key_columns.append('water_level_smooth')
            if 'flow_smooth' in numeric_columns:
                key_columns.append('flow_smooth')

            # Raw values of the key columns
            window_features = []
            column_values = {}  # keep per-column values for the cross features below

            for col in key_columns:
                if col in window.columns:
                    # Fill NaNs before extracting
                    col_values = window[col].ffill().bfill().values
                    window_features.extend(col_values)
                    column_values[col] = col_values

            # Time features
            last_date = window['DateTime'].iloc[-1]
            # Seasonal features - sine/cosine transforms capture periodicity
            month = last_date.month
            day = last_date.day
            hour = last_date.hour
            # Day of year (1-366)
            day_of_year = last_date.dayofyear
            # Periodic encodings
            window_features.extend([
                # Month cycle
                np.sin(2 * np.pi * month / 12),
                np.cos(2 * np.pi * month / 12),
                # Day-of-month cycle
                np.sin(2 * np.pi * day / 31),
                np.cos(2 * np.pi * day / 31),
                # Hour-of-day cycle
                np.sin(2 * np.pi * hour / 24),
                np.cos(2 * np.pi * hour / 24),
                # Day-of-year cycle
                np.sin(2 * np.pi * day_of_year / 366),
                np.cos(2 * np.pi * day_of_year / 366),
                # Linear time feature, captures trend
                day_of_year / 366.0,
                # Day of week, captures weekly patterns
                last_date.dayofweek / 7.0
            ])

            # Keep the time features for the cross terms
            time_features = {
                'month_sin': np.sin(2 * np.pi * month / 12),
                'month_cos': np.cos(2 * np.pi * month / 12),
                'day_sin': np.sin(2 * np.pi * day / 31),
                'day_cos': np.cos(2 * np.pi * day / 31),
                'hour_sin': np.sin(2 * np.pi * hour / 24),
                'hour_cos': np.cos(2 * np.pi * hour / 24),
                'day_of_year': day_of_year / 366.0,
                'weekday': last_date.dayofweek / 7.0
            }

            # Statistical features of the key salinity columns
            stats_features = {}
            for col in ['upstream_smooth', 'downstream_smooth']:
                if col in window.columns:
                    values = window[col].values
                    # Window statistics
                    mean_val = np.mean(values)
                    std_val = np.std(values)
                    max_val = np.max(values)
                    min_val = np.min(values)
                    last_val = values[-1]
                    mean_24h = np.mean(values[-24:]) if len(values) >= 24 else mean_val

                    window_features.extend([
                        mean_val,  # mean
                        std_val,   # standard deviation
                        max_val,   # maximum
                        min_val,   # minimum
                        last_val,  # most recent value
                        mean_24h   # mean of the last 24 hours
                    ])

                    # Keep the statistics for the cross terms
                    stats_features[f'{col}_mean'] = mean_val
                    stats_features[f'{col}_std'] = std_val
                    stats_features[f'{col}_max'] = max_val
                    stats_features[f'{col}_min'] = min_val
                    stats_features[f'{col}_last'] = last_val
                    stats_features[f'{col}_mean_24h'] = mean_24h

                    # Difference features (capture trend changes)
                    if len(values) > 1:
                        diff_1 = values[-1] - values[-2]  # first-order difference
                        window_features.append(diff_1)
                        stats_features[f'{col}_diff_1'] = diff_1
                    else:
                        window_features.append(0)
                        stats_features[f'{col}_diff_1'] = 0

                    if len(values) > 24:
                        diff_24 = values[-1] - values[-25]  # 24-hour difference
                        window_features.append(diff_24)
                        stats_features[f'{col}_diff_24'] = diff_24
                    else:
                        window_features.append(0)
                        stats_features[f'{col}_diff_24'] = 0

            # Tidal features (if present)
            tidal_features = {}
            if 'lunar_phase_sin' in window.columns and 'lunar_phase_cos' in window.columns:
                lunar_sin = window['lunar_phase_sin'].iloc[-1]
                lunar_cos = window['lunar_phase_cos'].iloc[-1]
                window_features.extend([lunar_sin, lunar_cos])
                tidal_features['lunar_sin'] = lunar_sin
                tidal_features['lunar_cos'] = lunar_cos

            if 'is_high_tide' in window.columns:
                is_high_tide = window['is_high_tide'].iloc[-1]
                window_features.append(is_high_tide)
                tidal_features['is_high_tide'] = is_high_tide

            # Feature crosses
            # 1. Upstream/downstream salinity ratio and difference - captures the gradient.
            #    Zero-padded when undefined so every sample has the same width
            #    (otherwise np.array would see ragged rows).
            if 'upstream_smooth' in column_values and 'downstream_smooth' in column_values:
                up_last = column_values['upstream_smooth'][-1]
                down_last = column_values['downstream_smooth'][-1]

                if up_last > 0 and down_last > 0:
                    # Salinity ratio
                    window_features.append(down_last / up_last)
                    # Salinity difference
                    window_features.append(down_last - up_last)
                else:
                    window_features.extend([0.0, 0.0])

                # Salinity change rates over 24 hours
                if len(column_values['upstream_smooth']) > 24 and len(column_values['downstream_smooth']) > 24:
                    up_24h_ago = column_values['upstream_smooth'][-25]
                    down_24h_ago = column_values['downstream_smooth'][-25]

                    if up_24h_ago > 0 and down_24h_ago > 0:
                        up_change_rate = (up_last - up_24h_ago) / up_24h_ago
                        down_change_rate = (down_last - down_24h_ago) / down_24h_ago
                        window_features.extend([up_change_rate, down_change_rate])

                        # Difference of the two change rates
                        window_features.append(down_change_rate - up_change_rate)
                    else:
                        window_features.extend([0.0, 0.0, 0.0])
                else:
                    window_features.extend([0.0, 0.0, 0.0])

            # 2. Time x salinity crosses - salinity behavior at different times.
            #    Keyed on f'{col}_last', which is what stats_features actually stores
            #    (the original check on the bare column name never matched).
            for col in ['upstream_smooth', 'downstream_smooth']:
                if f'{col}_last' in stats_features:
                    # Month x salinity - seasonal salinity relationship
                    window_features.append(stats_features[f'{col}_last'] * time_features['month_sin'])
                    window_features.append(stats_features[f'{col}_last'] * time_features['month_cos'])

                    # Hour x salinity - intraday salinity relationship
                    window_features.append(stats_features[f'{col}_last'] * time_features['hour_sin'])
                    window_features.append(stats_features[f'{col}_last'] * time_features['hour_cos'])

            # 3. Tide x salinity crosses - tidal influence on salinity
            if 'lunar_sin' in tidal_features and 'downstream_smooth_last' in stats_features:
                window_features.append(tidal_features['lunar_sin'] * stats_features['downstream_smooth_last'])
                window_features.append(tidal_features['lunar_cos'] * stats_features['downstream_smooth_last'])

                if 'is_high_tide' in tidal_features:
                    window_features.append(tidal_features['is_high_tide'] * stats_features['downstream_smooth_last'])

            # 4. Water level / flow x salinity crosses (if present)
            if 'water_level_smooth' in column_values and 'downstream_smooth_last' in stats_features:
                water_level = column_values['water_level_smooth'][-1]
                window_features.append(water_level * stats_features['downstream_smooth_last'])

            if 'flow_smooth' in column_values and 'downstream_smooth_last' in stats_features:
                flow = column_values['flow_smooth'][-1]
                window_features.append(flow * stats_features['downstream_smooth_last'])

            # Target: mean downstream salinity over the forecast window
            if len(target_window) > 0:
                # Handle NaNs in the target
                target_values = target_window['downstream_smooth'].ffill().bfill().values
                if len(target_values) > 0:
                    target = np.mean(target_values)

                    # Keep the sample only if features and target are valid
                    if not np.any(np.isnan(window_features)) and not np.isnan(target) and not np.isinf(target):
                        features.append(window_features)
                        targets.append(target)

        if not features:
            print("Warning: no valid features were generated")
            return np.array([]), np.array([])

        # Convert to numpy arrays
        X = np.array(features)
        y = np.array(targets)

        print(f"Feature matrix built, shape: {X.shape}, features: {X.shape[1]}")

        # Data augmentation - resample the sparse samples
        print("Running data augmentation...")
        original_X, original_y = X.copy(), y.copy()

        # Indices of high-salinity samples (usually rare)
        high_salinity_idx = np.where(original_y > np.percentile(original_y, 75))[0]
        print(f"High-salinity samples in the original data: {len(high_salinity_idx)}")

        if len(high_salinity_idx) > 10:
            # Oversample the high-salinity samples
            augmented_X = []
            augmented_y = []

            # Keep the original data
            augmented_X.append(original_X)
            augmented_y.append(original_y)

            # Augment the high-salinity samples with random noise
            high_X = original_X[high_salinity_idx]
            high_y = original_y[high_salinity_idx]

            # Add lightly noised copies
            noise_level = 0.02
            for _ in range(2):  # add the high-salinity samples twice more
                noise = np.random.normal(0, noise_level, high_X.shape)
                augmented_X.append(high_X + noise)
                augmented_y.append(high_y)

            # Combine all samples
            X = np.vstack(augmented_X)
            y = np.concatenate(augmented_y)

            print(f"Samples after augmentation: {len(X)}")

        return X, y

    except Exception as e:
        print(f"Feature creation failed: {e}")
        return np.array([]), np.array([])
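
# A minimal, uncalled sketch of how the per-sample window extraction above could
# be vectorized with numpy's sliding_window_view (imported at the top of this
# file but otherwise unused). This is an assumption about a possible
# optimization, not the method create_features_vectorized actually uses.
def _demo_sliding_windows():
    values = np.arange(10.0)
    # One call yields every length-4 window: shape (7, 4)
    windows = sliding_window_view(values, window_shape=4)
    print(windows.shape, windows[0], windows[-1])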


def generate_prediction_features(df, current_date, look_back=168):
    """
    Generate features for prediction with the same feature engineering as the
    training samples, including the cross features, so the dimensions match.
    """
    try:
        # Sort by time
        df = df.sort_values('DateTime')

        # All numeric columns (excluding DateTime)
        numeric_columns = df.select_dtypes(include=[np.number]).columns.tolist()
        if 'DateTime' in numeric_columns:
            numeric_columns.remove('DateTime')

        # Drop unneeded columns to keep the dimensionality down
        exclude_columns = ['source_name', 'hour', 'weekday', 'month']  # avoid duplicated features
        numeric_columns = [col for col in numeric_columns if col not in exclude_columns
                           and not (col.endswith('_sin') or col.endswith('_cos'))]

        # Locate the current date within the data
        current_idx = df[df['DateTime'] <= current_date].index[-1]

        # Need a full look_back-hour window
        if current_idx < look_back - 1:
            print(f"Not enough data: need {look_back} hours but only {current_idx+1} are available")
            return None

        window = df.iloc[current_idx-look_back+1:current_idx+1]

        # Base features - restrict to the key columns to limit dimensionality
        key_columns = ['upstream_smooth', 'downstream_smooth']

        # Smoothed trend columns
        if 'upstream_trend_24h' in numeric_columns:
            key_columns.extend(['upstream_trend_24h', 'downstream_trend_24h'])

        # Water-level and flow columns (if present)
        if 'water_level_smooth' in numeric_columns:
            key_columns.append('water_level_smooth')
        if 'flow_smooth' in numeric_columns:
            key_columns.append('flow_smooth')

        # Raw values of the key columns
        features = []
        column_values = {}  # keep per-column values for the cross features below

        for col in key_columns:
            if col in window.columns:
                # Fill NaNs before extracting
                col_values = window[col].ffill().bfill().values
                features.extend(col_values)
                column_values[col] = col_values

        # Time features
        # Seasonal features - sine/cosine transforms capture periodicity
        month = current_date.month
        day = current_date.day
        hour = current_date.hour
        # Day of year (1-366)
        day_of_year = current_date.dayofyear
        # Periodic encodings
        features.extend([
            # Month cycle
            np.sin(2 * np.pi * month / 12),
            np.cos(2 * np.pi * month / 12),
            # Day-of-month cycle
            np.sin(2 * np.pi * day / 31),
            np.cos(2 * np.pi * day / 31),
            # Hour-of-day cycle
            np.sin(2 * np.pi * hour / 24),
            np.cos(2 * np.pi * hour / 24),
            # Day-of-year cycle
            np.sin(2 * np.pi * day_of_year / 366),
            np.cos(2 * np.pi * day_of_year / 366),
            # Linear time feature, captures trend
            day_of_year / 366.0,
            # Day of week, captures weekly patterns
            current_date.dayofweek / 7.0
        ])

        # Keep the time features for the cross terms
        time_features = {
            'month_sin': np.sin(2 * np.pi * month / 12),
            'month_cos': np.cos(2 * np.pi * month / 12),
            'hour_sin': np.sin(2 * np.pi * hour / 24),
            'hour_cos': np.cos(2 * np.pi * hour / 24)
        }

        # Statistical features of the key salinity columns
        stats_features = {}
        for col in ['upstream_smooth', 'downstream_smooth']:
            if col in window.columns:
                values = window[col].values
                last_val = values[-1]
                # Window statistics
                features.extend([
                    np.mean(values),   # mean
                    np.std(values),    # standard deviation
                    np.max(values),    # maximum
                    np.min(values),    # minimum
                    last_val,          # most recent value
                    np.mean(values[-24:]) if len(values) >= 24 else np.mean(values)  # last-24h mean
                ])
                stats_features[f'{col}_last'] = last_val

                # Difference features (capture trend changes)
                features.append(values[-1] - values[-2] if len(values) > 1 else 0)   # first-order difference
                features.append(values[-1] - values[-25] if len(values) > 24 else 0)  # 24-hour difference

        # Tidal features (if present)
        tidal_features = {}
        if 'lunar_phase_sin' in window.columns and 'lunar_phase_cos' in window.columns:
            lunar_sin = window['lunar_phase_sin'].iloc[-1]
            lunar_cos = window['lunar_phase_cos'].iloc[-1]
            features.extend([lunar_sin, lunar_cos])
            tidal_features['lunar_sin'] = lunar_sin
            tidal_features['lunar_cos'] = lunar_cos

        if 'is_high_tide' in window.columns:
            is_high_tide = window['is_high_tide'].iloc[-1]
            features.append(is_high_tide)
            tidal_features['is_high_tide'] = is_high_tide

        # Cross features, mirroring create_features_vectorized exactly
        # 1. Upstream/downstream salinity ratio and difference (zero-padded when undefined)
        if 'upstream_smooth' in column_values and 'downstream_smooth' in column_values:
            up_last = column_values['upstream_smooth'][-1]
            down_last = column_values['downstream_smooth'][-1]

            if up_last > 0 and down_last > 0:
                features.append(down_last / up_last)   # salinity ratio
                features.append(down_last - up_last)   # salinity difference
            else:
                features.extend([0.0, 0.0])

            # Salinity change rates over 24 hours
            if len(column_values['upstream_smooth']) > 24 and len(column_values['downstream_smooth']) > 24:
                up_24h_ago = column_values['upstream_smooth'][-25]
                down_24h_ago = column_values['downstream_smooth'][-25]

                if up_24h_ago > 0 and down_24h_ago > 0:
                    up_change_rate = (up_last - up_24h_ago) / up_24h_ago
                    down_change_rate = (down_last - down_24h_ago) / down_24h_ago
                    features.extend([up_change_rate, down_change_rate])
                    features.append(down_change_rate - up_change_rate)
                else:
                    features.extend([0.0, 0.0, 0.0])
            else:
                features.extend([0.0, 0.0, 0.0])

        # 2. Time x salinity crosses
        for col in ['upstream_smooth', 'downstream_smooth']:
            if f'{col}_last' in stats_features:
                features.append(stats_features[f'{col}_last'] * time_features['month_sin'])
                features.append(stats_features[f'{col}_last'] * time_features['month_cos'])
                features.append(stats_features[f'{col}_last'] * time_features['hour_sin'])
                features.append(stats_features[f'{col}_last'] * time_features['hour_cos'])

        # 3. Tide x salinity crosses
        if 'lunar_sin' in tidal_features and 'downstream_smooth_last' in stats_features:
            features.append(tidal_features['lunar_sin'] * stats_features['downstream_smooth_last'])
            features.append(tidal_features['lunar_cos'] * stats_features['downstream_smooth_last'])

            if 'is_high_tide' in tidal_features:
                features.append(tidal_features['is_high_tide'] * stats_features['downstream_smooth_last'])

        # 4. Water level / flow x salinity crosses (if present)
        if 'water_level_smooth' in column_values and 'downstream_smooth_last' in stats_features:
            features.append(column_values['water_level_smooth'][-1] * stats_features['downstream_smooth_last'])

        if 'flow_smooth' in column_values and 'downstream_smooth_last' in stats_features:
            features.append(column_values['flow_smooth'][-1] * stats_features['downstream_smooth_last'])

        return np.array(features)

    except Exception as e:
        print(f"Prediction feature generation failed: {e}")
        return None
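
# A minimal, uncalled sanity check: with the cross features mirrored above, the
# training path (create_features_vectorized) and the prediction path
# (generate_prediction_features) should produce the same feature width. Running
# this on a processed DataFrame surfaces any future drift before it breaks
# scaler.transform or model.predict.
def _demo_feature_dim_check(df, look_back=168):
    X, _ = create_features_vectorized(df, look_back=look_back, forecast_horizon=24)
    feats = generate_prediction_features(df, df['DateTime'].iloc[-1], look_back=look_back)
    if X.size and feats is not None:
        print('train dim:', X.shape[1], 'predict dim:', feats.shape[0])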


# Main function
def main():
    global df, current_df, qinglong_df, taicang_df
    # Load both datasets
    qinglong_df, taicang_df = load_both_datasets()
    current_df = qinglong_df  # default to the 青龙港-陈行 dataset

    if current_df is not None:
        df = current_df  # set the active dataset
        run_gui()
    else:
        print("Data loading failed; cannot run predictions.")

# Program entry point
if __name__ == "__main__":
    main()