# Modified XGBoost version
import os
import pickle
import tkinter as tk
import tkinter.font as tkfont
from datetime import timedelta
from time import time
from tkinter import ttk

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from lunardate import LunarDate
from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg, NavigationToolbar2Tk
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor

# Configure matplotlib so that Chinese labels and minus signs render correctly.
matplotlib.rcParams['font.sans-serif'] = ['SimHei', 'Microsoft YaHei', 'SimSun', 'Arial Unicode MS']
matplotlib.rcParams['axes.unicode_minus'] = False
matplotlib.rcParams['font.family'] = 'sans-serif'

# Module-level cache variables and feature names
# (feature_columns is only a placeholder here).
cached_model = None
last_training_time = None
feature_columns = None


# -------------------------------
# Data loading and preprocessing
# -------------------------------
def _clean_series(raw_df, value_name, min_salinity):
    """Normalise one raw station table into a ``DateTime`` + value frame.

    The raw table is assumed to have exactly three columns, in the order
    DateTime, TagName, Value. Rows whose value is below ``min_salinity`` (or
    not numeric) are dropped; values outside mean +/- 3 standard deviations
    are replaced by NaN so that they can be interpolated later.
    """
    series_df = raw_df.copy()
    series_df.columns = ['DateTime', 'TagName', 'Value']
    series_df['DateTime'] = pd.to_datetime(series_df['DateTime'])
    series_df['Value'] = pd.to_numeric(series_df['Value'], errors='coerce')

    # Explicit copy: the frame is modified in place below, and writing into a
    # boolean-filtered view triggers SettingWithCopyWarning.
    series_df = series_df[series_df['Value'] >= min_salinity].copy()

    # Zeros are treated as missing readings. With the default threshold they
    # are already removed by the filter above; this only matters when
    # min_salinity is lowered to 0 or below.
    series_df.loc[series_df['Value'] == 0, 'Value'] = np.nan

    mean_val = series_df['Value'].mean()
    std_val = series_df['Value'].std()
    lower_bound = mean_val - 3 * std_val
    upper_bound = mean_val + 3 * std_val
    is_outlier = (series_df['Value'] < lower_bound) | (series_df['Value'] > upper_bound)
    series_df.loc[is_outlier, 'Value'] = np.nan

    return series_df.rename(columns={'Value': value_name})[['DateTime', value_name]]


def load_data(upstream_file, downstream_file, min_salinity=5):
    """Load, clean, merge and smooth the upstream/downstream salinity series.

    Args:
        upstream_file: Path of the upstream station CSV.
        downstream_file: Path of the downstream station CSV.
        min_salinity: Readings below this value are discarded before any
            other processing. Defaults to 5, the previously hard-coded value.

    Returns:
        A DataFrame sorted by ``DateTime`` with the columns ``DateTime``,
        ``upstream``, ``downstream``, ``upstream_smooth`` and
        ``downstream_smooth``; or ``None`` when either file does not exist.
    """
    try:
        upstream_raw = pd.read_csv(upstream_file)
        downstream_raw = pd.read_csv(downstream_file)
    except FileNotFoundError:
        print("文件未找到,请检查路径")
        return None

    upstream_df = _clean_series(upstream_raw, 'upstream', min_salinity)
    downstream_df = _clean_series(downstream_raw, 'downstream', min_salinity)

    # Keep only timestamps present at both stations.
    merged_df = pd.merge(upstream_df, downstream_df, on='DateTime', how='inner')
    print(f"合并前数据行数: {len(merged_df)}")

    # Sort BEFORE interpolating and smoothing: every step below works on
    # neighbouring rows, so it is only meaningful on chronologically ordered
    # data. (Previously the sort happened last, after all of these steps.)
    merged_df = merged_df.sort_values('DateTime', kind='stable').set_index('DateTime')

    # Gap filling: short gaps linearly, longer gaps weighted by time, and
    # whatever remains by forward then backward fill.
    for col in ('upstream', 'downstream'):
        filled = merged_df[col].interpolate(method='linear', limit=4)
        filled = filled.interpolate(method='time', limit=24)
        # .ffill()/.bfill() replace the deprecated fillna(method=...).
        merged_df[col] = filled.ffill().bfill()

    # Smoothing: centred moving average over 24 rows.
    for col in ('upstream', 'downstream'):
        merged_df[f'{col}_smooth'] = (
            merged_df[col].rolling(window=24, min_periods=1, center=True).mean()
        )

    # Low-salinity rows get a wider window. The rolling mean is computed over
    # the low-salinity rows only, i.e. it ignores the rows in between them.
    low_sal_mask = merged_df['upstream'] < 50
    if low_sal_mask.any():
        merged_df.loc[low_sal_mask, 'upstream_smooth'] = (
            merged_df.loc[low_sal_mask, 'upstream']
            .rolling(window=48, min_periods=1, center=True)
            .mean()
        )

    merged_df = merged_df.dropna()
    finite_rows = np.isfinite(merged_df[['upstream', 'downstream']]).all(axis=1)
    merged_df = merged_df[finite_rows].reset_index()

    print(f"清洗后数据行数: {len(merged_df)}")
    print(f"上游盐度范围: {merged_df['upstream'].min()} - {merged_df['upstream'].max()}")
    print(f"下游盐度范围: {merged_df['downstream'].min()} - {merged_df['downstream'].max()}")
    return merged_df


# Manual smoke test (kept for reference):
# df = load_data('青龙港1.csv', '一取水.csv')
# df.to_csv('merged_data.csv', index=False)
# print(f"Merged data saved to 'merged_data.csv' successfully")
#
# # Plot salinity over time
# plt.figure(figsize=(12, 6))
# plt.plot(df['DateTime'], df['upstream_smooth'], label='上游盐度', color='blue')
# plt.plot(df['DateTime'], df['downstream_smooth'], label='下游盐度', color='red')
# plt.xlabel('时间')
# plt.ylabel('盐度')
# plt.title('盐度随时间变化图')
# plt.legend()
# plt.grid(True)
# plt.tight_layout()
# plt.savefig('salinity_time_series.png', dpi=300)
# plt.show()


# ---------------------- Feature engineering
# -------------------------------
# Lunar-calendar (tidal) features
# -------------------------------
def add_lunar_features(df):
    """Add lunar-calendar features derived from ``df['DateTime']``.

    Adds the columns ``lunar_day`` (day of the lunar month), ``lunar_phase_sin``
    and ``lunar_phase_cos`` (the lunar day encoded on a 15-day cycle) and
    ``is_high_tide`` (1 for lunar days 1-5 and 16-20, else 0).

    The frame is modified in place and also returned. ``DateTime`` must have a
    datetime64 dtype.
    """
    # The lunar date depends only on the calendar day, so convert each
    # distinct day once instead of once per row.
    calendar_days = df['DateTime'].dt.normalize()
    lunar_day_by_date = {
        day: LunarDate.fromSolarDate(day.year, day.month, day.day).day
        for day in calendar_days.drop_duplicates()
    }
    lunar_day = calendar_days.map(lunar_day_by_date)

    phase = 2 * np.pi * lunar_day / 15
    df['lunar_day'] = lunar_day
    df['lunar_phase_sin'] = np.sin(phase)
    df['lunar_phase_cos'] = np.cos(phase)
    df['is_high_tide'] = ((lunar_day <= 5) | lunar_day.between(16, 20)).astype('int64')
    return df


# -------------------------------
# Lagged features (vectorised via shift)
# -------------------------------
def batch_create_delay_features(df, delay_hours):
    """Add lagged copies of both smoothed series for every entry in ``delay_hours``.

    For each ``delay`` the columns ``upstream_delay_{delay}h`` and
    ``downstream_delay_{delay}h`` are added. The shift is by ROWS, so a delay
    only equals that many hours when the rows are evenly spaced one hour
    apart — NOTE(review): confirm the sampling interval of the input data.

    The frame is modified in place and also returned.
    """
    for delay in delay_hours:
        df[f'upstream_delay_{delay}h'] = df['upstream_smooth'].shift(delay)
        df[f'downstream_delay_{delay}h'] = df['downstream_smooth'].shift(delay)
    return df


# -------------------------------
# Calendar features
# -------------------------------
def add_time_features(df):
    """Add ``hour``, ``weekday`` (Monday=0) and ``month`` columns from ``DateTime``.

    The frame is modified in place and also returned.
    """
    timestamps = df['DateTime'].dt
    df['hour'] = timestamps.hour
    df['weekday'] = timestamps.dayofweek
    df['month'] = timestamps.month
    return df


# -------------------------------
# Rolling statistics
# -------------------------------
def add_statistical_features(df):
    """Add trailing rolling statistics of both smoothed series.

    Over a 24-row window: mean, std, max and min (``*_1d_up`` / ``*_1d_down``).
    Over a 72-row window: mean only (``mean_3d_up`` / ``mean_3d_down``).
    The first ``window - 1`` rows of each feature are NaN. Windows are counted
    in rows, so "1d"/"3d" assume hourly rows — NOTE(review): confirm.

    The frame is modified in place and also returned.
    """
    for suffix, column in (('up', 'upstream_smooth'), ('down', 'downstream_smooth')):
        one_day = df[column].rolling(window=24)
        df[f'mean_1d_{suffix}'] = one_day.mean()
        df[f'std_1d_{suffix}'] = one_day.std()
        df[f'max_1d_{suffix}'] = one_day.max()
        df[f'min_1d_{suffix}'] = one_day.min()

    df['mean_3d_up'] = df['upstream_smooth'].rolling(window=72).mean()
    df['mean_3d_down'] = df['downstream_smooth'].rolling(window=72).mean()
    return df


# Apply the feature engineering and save the data
if __name__ == "__main__":
    df = load_data('青龙港1.csv', '一取水.csv')
    if df is None:
        # load_data has already reported the missing file; stop here instead
        # of failing later with an AttributeError on None.
        raise SystemExit("数据加载失败,程序终止")

    # Calendar features
    df = add_time_features(df)

    # Lunar-calendar features
    df = add_lunar_features(df)

    # Rolling statistics
    df = add_statistical_features(df)

    # Lagged features for 1, 2, 3, 6, 12, 24, 48 and 72 hours
    delay_hours = [1, 2, 3, 6, 12, 24, 48, 72]
    df = batch_create_delay_features(df, delay_hours)

    # # Save the data with all features
    # df.to_csv('feature_engineered_data.csv', index=False)
    # print(f"特征工程后的数据已保存到 'feature_engineered_data.csv',共{len(df)}行,{len(df.columns)}列")

    # Drop the rows that the rolling/lag features left incomplete.
    df_clean = df.dropna()
    print(f"删除NaN后的数据行数: {len(df_clean)}")

    # Feature correlation analysis
    print("\n进行特征相关性分析...")

    # Select every numeric column. Matching only float64/int64 would skip the
    # calendar features on pandas >= 2.0, where .dt.hour etc. are int32.
    # DateTime is datetime64 and therefore never selected.
    numeric_cols = df_clean.select_dtypes(include='number').columns.tolist()

    corr_matrix = df_clean[numeric_cols].corr()
    corr_matrix.to_csv('feature_correlation_matrix.csv')
    print("相关矩阵已保存到 'feature_correlation_matrix.csv'")

    # 1. Correlation of every feature with the target (downstream salinity)
    target_corrs = corr_matrix['downstream_smooth'].sort_values(ascending=False)
    target_corrs.to_csv('target_correlation.csv')
    print("\n与下游盐度最相关的前10个特征:")
    print(target_corrs.head(10))

    # 2. Correlation heat map
    import seaborn as sns

    plt.figure(figsize=(16, 14))
    sns.heatmap(corr_matrix, annot=False, cmap='coolwarm', center=0, linewidths=0.5)
    plt.title('特征相关性热图', fontsize=16)
    plt.tight_layout()
    plt.savefig('correlation_heatmap.png', dpi=300)
    plt.close()
    print("相关性热图已保存到 'correlation_heatmap.png'")

    # 3. Bar chart of the 15 features most correlated with the target.
    # The target's self-correlation is removed by label rather than by
    # position, which stays correct even if another feature ties at 1.0.
    plt.figure(figsize=(12, 8))
    top_corrs = target_corrs.drop('downstream_smooth').head(15)
    top_corrs.plot(kind='barh', color='skyblue')
    plt.title('与下游盐度相关性最高的15个特征', fontsize=14)
    plt.xlabel('相关系数', fontsize=12)
    plt.tight_layout()
    plt.savefig('top_correlations.png', dpi=300)
    plt.close()
    print("目标相关性条形图已保存到 'top_correlations.png'")

    # 4. Highly correlated feature pairs (|r| > 0.9), lower triangle only so
    # that each pair is reported once and the diagonal is skipped.
    corr_columns = corr_matrix.columns
    corr_values = corr_matrix.to_numpy()
    high_corr_pairs = [
        (corr_columns[i], corr_columns[j], corr_values[i, j])
        for i in range(len(corr_columns))
        for j in range(i)
        if abs(corr_values[i, j]) > 0.9
    ]

    high_corr_df = pd.DataFrame(high_corr_pairs, columns=['Feature1', 'Feature2', 'Correlation'])
    high_corr_df = high_corr_df.sort_values('Correlation', ascending=False)
    high_corr_df.to_csv('high_correlation_pairs.csv', index=False)
    print(f"\n发现{len(high_corr_pairs)}对高度相关的特征对(|相关系数|>0.9),已保存到'high_correlation_pairs.csv'")
    if high_corr_pairs:
        print("\n高度相关的特征对示例:")
        print(high_corr_df.head(5))

    print("\n相关性分析完成,可以基于结果进行特征选择或降维。")

    # Save the cleaned data with all features
    df_clean.to_csv('cleaned_feature_data.csv', index=False)
    print(f"\n清洗后的特征数据已保存到 'cleaned_feature_data.csv',共{len(df_clean)}行,{len(df_clean.columns)}列")

# The generated data is fed into model training below.
# -------------------------------
# Model training and prediction, reporting validation accuracy (RMSE, MAE)
# -------------------------------