# xgboost修改版本
|
import os
|
import pickle
|
import pandas as pd
|
import numpy as np
|
import tkinter as tk
|
import tkinter.font as tkfont
|
from tkinter import ttk
|
from datetime import timedelta
|
from time import time
|
import matplotlib.pyplot as plt
|
from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg, NavigationToolbar2Tk
|
from xgboost import XGBRegressor
|
from lunardate import LunarDate
|
from sklearn.model_selection import train_test_split
|
from sklearn.metrics import mean_squared_error, mean_absolute_error
|
import matplotlib
|
|
# 配置 matplotlib 中文显示
|
matplotlib.rcParams['font.sans-serif'] = ['SimHei', 'Microsoft YaHei', 'SimSun', 'Arial Unicode MS']
|
matplotlib.rcParams['axes.unicode_minus'] = False
|
matplotlib.rcParams['font.family'] = 'sans-serif'
|
|
# 全局缓存变量及特征名称(此处 feature_columns 仅为占位)
|
cached_model = None
|
last_training_time = None
|
feature_columns = None
|
|
# 数据加载与预处理函数
|
# -------------------------------
|
def load_data(upstream_file, downstream_file):
|
"""
|
加载所有相关数据
|
"""
|
try:
|
upstream_df = pd.read_csv(upstream_file)
|
downstream_df = pd.read_csv(downstream_file)
|
except FileNotFoundError:
|
print("文件未找到,请检查路径")
|
return None
|
|
# 假设原始数据列依次为 ['DateTime', 'TagName', 'Value']
|
upstream_df.columns = ['DateTime', 'TagName', 'Value']
|
downstream_df.columns = ['DateTime', 'TagName', 'Value']
|
|
|
|
# 转换时间格式及数值处理
|
upstream_df['DateTime'] = pd.to_datetime(upstream_df['DateTime'])
|
downstream_df['DateTime'] = pd.to_datetime(downstream_df['DateTime'])
|
|
|
# 数值处理
|
upstream_df['Value'] = pd.to_numeric(upstream_df['Value'], errors='coerce')
|
downstream_df['Value'] = pd.to_numeric(downstream_df['Value'], errors='coerce')
|
|
|
|
# 过滤盐度小于5的数据 这里数据可以更改
|
upstream_df = upstream_df[upstream_df['Value'] >= 5]
|
downstream_df = downstream_df[downstream_df['Value'] >= 5]
|
|
|
# 将0替换为NaN,并利用3倍标准差法处理异常值 数据处理平滑
|
for df in [upstream_df, downstream_df]:
|
df.loc[df['Value'] == 0, 'Value'] = np.nan
|
mean_val, std_val = df['Value'].mean(), df['Value'].std()
|
lower_bound, upper_bound = mean_val - 3 * std_val, mean_val + 3 * std_val
|
df.loc[(df['Value'] < lower_bound) | (df['Value'] > upper_bound), 'Value'] = np.nan
|
|
# 重命名 Value 列并保留需要的列
|
upstream_df = upstream_df.rename(columns={'Value': 'upstream'})[['DateTime', 'upstream']]
|
downstream_df = downstream_df.rename(columns={'Value': 'downstream'})[['DateTime', 'downstream']]
|
|
|
# 合并数据
|
merged_df = pd.merge(upstream_df, downstream_df, on='DateTime', how='inner')
|
|
|
print(f"合并前数据行数: {len(merged_df)}")
|
merged_df = merged_df.set_index('DateTime')
|
|
# 插值:先用线性,再用时间插值,最后用前向后向填充
|
merged_df['upstream'] = merged_df['upstream'].interpolate(method='linear', limit=4)
|
merged_df['downstream'] = merged_df['downstream'].interpolate(method='linear', limit=4)
|
|
|
merged_df['upstream'] = merged_df['upstream'].interpolate(method='time', limit=24)
|
merged_df['downstream'] = merged_df['downstream'].interpolate(method='time', limit=24)
|
|
merged_df['upstream'] = merged_df['upstream'].fillna(method='ffill').fillna(method='bfill')
|
merged_df['downstream'] = merged_df['downstream'].fillna(method='ffill').fillna(method='bfill')
|
|
# 平滑处理:使用滑动窗口移动平均
|
merged_df['upstream_smooth'] = merged_df['upstream'].rolling(window=24, min_periods=1, center=True).mean()
|
merged_df['downstream_smooth'] = merged_df['downstream'].rolling(window=24, min_periods=1, center=True).mean()
|
|
# 对低盐度部分用更大窗口平滑
|
low_sal_mask = merged_df['upstream'] < 50
|
if low_sal_mask.any():
|
merged_df.loc[low_sal_mask, 'upstream_smooth'] = merged_df.loc[low_sal_mask, 'upstream']\
|
.rolling(window=48, min_periods=1, center=True).mean()
|
|
merged_df = merged_df.dropna()
|
merged_df = merged_df[merged_df['upstream'].apply(np.isfinite)]
|
merged_df = merged_df[merged_df['downstream'].apply(np.isfinite)]
|
|
|
merged_df = merged_df.reset_index()
|
print(f"清洗后数据行数: {len(merged_df)}")
|
print(f"上游盐度范围: {merged_df['upstream'].min()} - {merged_df['upstream'].max()}")
|
print(f"下游盐度范围: {merged_df['downstream'].min()} - {merged_df['downstream'].max()}")
|
|
merged_df = merged_df.sort_values('DateTime')
|
return merged_df
|
|
|
# 测试
|
# df = load_data('青龙港1.csv', '一取水.csv')
|
# df.to_csv('merged_data.csv', index=False)
|
# print(f"Merged data saved to 'merged_data.csv' successfully")
|
|
# # 绘制盐度随时间变化图
|
# plt.figure(figsize=(12, 6))
|
# plt.plot(df['DateTime'], df['upstream_smooth'], label='上游盐度', color='blue')
|
# plt.plot(df['DateTime'], df['downstream_smooth'], label='下游盐度', color='red')
|
# plt.xlabel('时间')
|
# plt.ylabel('盐度')
|
# plt.title('盐度随时间变化图')
|
# plt.legend()
|
# plt.grid(True)
|
# plt.tight_layout()
|
# plt.savefig('salinity_time_series.png', dpi=300)
|
# plt.show()
|
|
|
# ----------------------特征工程部分
|
|
|
# -------------------------------
|
# 添加农历(潮汐)特征
|
# -------------------------------
|
def add_lunar_features(df):
|
lunar_day, lunar_phase_sin, lunar_phase_cos, is_high_tide = [], [], [], []
|
for dt in df['DateTime']:
|
ld = LunarDate.fromSolarDate(dt.year, dt.month, dt.day)
|
lunar_day.append(ld.day)
|
lunar_phase_sin.append(np.sin(2 * np.pi * ld.day / 15))
|
lunar_phase_cos.append(np.cos(2 * np.pi * ld.day / 15))
|
is_high_tide.append(1 if (ld.day <= 5 or (ld.day >= 16 and ld.day <= 20)) else 0)
|
df['lunar_day'] = lunar_day
|
df['lunar_phase_sin'] = lunar_phase_sin
|
df['lunar_phase_cos'] = lunar_phase_cos
|
df['is_high_tide'] = is_high_tide
|
return df
|
|
|
# -------------------------------
|
# 生成延迟特征(向量化,利用 shift)
|
# -------------------------------
|
def batch_create_delay_features(df, delay_hours):
|
for delay in delay_hours:
|
df[f'upstream_delay_{delay}h'] = df['upstream_smooth'].shift(delay)
|
df[f'downstream_delay_{delay}h'] = df['downstream_smooth'].shift(delay)
|
return df
|
|
|
# -------------------------------
|
# 添加时间特征
|
# -------------------------------
|
def add_time_features(df):
|
df['hour'] = df['DateTime'].dt.hour
|
df['weekday'] = df['DateTime'].dt.dayofweek
|
df['month'] = df['DateTime'].dt.month
|
return df
|
|
|
# -------------------------------
|
# 添加统计特征
|
# -------------------------------
|
def add_statistical_features(df):
|
# 1天统计特征
|
df['mean_1d_up'] = df['upstream_smooth'].rolling(window=24).mean()
|
df['std_1d_up'] = df['upstream_smooth'].rolling(window=24).std()
|
df['max_1d_up'] = df['upstream_smooth'].rolling(window=24).max()
|
df['min_1d_up'] = df['upstream_smooth'].rolling(window=24).min()
|
|
df['mean_1d_down'] = df['downstream_smooth'].rolling(window=24).mean()
|
df['std_1d_down'] = df['downstream_smooth'].rolling(window=24).std()
|
df['max_1d_down'] = df['downstream_smooth'].rolling(window=24).max()
|
df['min_1d_down'] = df['downstream_smooth'].rolling(window=24).min()
|
|
# 3天统计特征
|
df['mean_3d_up'] = df['upstream_smooth'].rolling(window=72).mean()
|
df['mean_3d_down'] = df['downstream_smooth'].rolling(window=72).mean()
|
|
return df
|
|
|
|
# 应用特征工程并保存数据
|
if __name__ == "__main__":
|
df = load_data('青龙港1.csv', '一取水.csv')
|
|
# 添加时间特征
|
df = add_time_features(df)
|
|
# 添加农历特征
|
df = add_lunar_features(df)
|
|
# 添加统计特征
|
df = add_statistical_features(df)
|
|
# 添加延迟特征 - 设置延迟小时数为1,2,3,6,12,24,48,72
|
delay_hours = [1, 2, 3, 6, 12, 24, 48, 72]
|
df = batch_create_delay_features(df, delay_hours)
|
|
# # 保存带有全部特征的数据
|
# df.to_csv('feature_engineered_data.csv', index=False)
|
# print(f"特征工程后的数据已保存到 'feature_engineered_data.csv',共{len(df)}行,{len(df.columns)}列")
|
|
# 清除NaN值
|
df_clean = df.dropna()
|
print(f"删除NaN后的数据行数: {len(df_clean)}")
|
|
# 进行特征相关性分析
|
print("\n进行特征相关性分析...")
|
|
# 选择数值型列进行相关性分析
|
numeric_cols = df_clean.select_dtypes(include=['float64', 'int64']).columns.tolist()
|
# 排除DateTime列
|
if 'DateTime' in numeric_cols:
|
numeric_cols.remove('DateTime')
|
|
# 计算相关矩阵
|
corr_matrix = df_clean[numeric_cols].corr()
|
|
# 保存相关矩阵到CSV
|
corr_matrix.to_csv('feature_correlation_matrix.csv')
|
print("相关矩阵已保存到 'feature_correlation_matrix.csv'")
|
|
# 1. 计算与下游盐度(目标变量)的相关性
|
target_corrs = corr_matrix['downstream_smooth'].sort_values(ascending=False)
|
target_corrs.to_csv('target_correlation.csv')
|
print("\n与下游盐度最相关的前10个特征:")
|
print(target_corrs.head(10))
|
|
# 2. 绘制相关性热图
|
plt.figure(figsize=(16, 14))
|
import seaborn as sns
|
sns.heatmap(corr_matrix, annot=False, cmap='coolwarm', center=0, linewidths=0.5)
|
plt.title('特征相关性热图', fontsize=16)
|
plt.tight_layout()
|
plt.savefig('correlation_heatmap.png', dpi=300)
|
plt.close()
|
print("相关性热图已保存到 'correlation_heatmap.png'")
|
|
# 3. 绘制与目标变量相关性最高的前15个特征的条形图
|
plt.figure(figsize=(12, 8))
|
target_corrs.iloc[1:16].plot(kind='barh', color='skyblue') # 排除自身相关性(=1)
|
plt.title('与下游盐度相关性最高的15个特征', fontsize=14)
|
plt.xlabel('相关系数', fontsize=12)
|
plt.tight_layout()
|
plt.savefig('top_correlations.png', dpi=300)
|
plt.close()
|
print("目标相关性条形图已保存到 'top_correlations.png'")
|
|
# 4. 检测高度相关的特征对 (相关系数>0.9)
|
high_corr_pairs = []
|
for i in range(len(corr_matrix.columns)):
|
for j in range(i):
|
if abs(corr_matrix.iloc[i, j]) > 0.9:
|
high_corr_pairs.append(
|
(corr_matrix.columns[i], corr_matrix.columns[j], corr_matrix.iloc[i, j])
|
)
|
|
high_corr_df = pd.DataFrame(high_corr_pairs, columns=['Feature1', 'Feature2', 'Correlation'])
|
high_corr_df = high_corr_df.sort_values('Correlation', ascending=False)
|
high_corr_df.to_csv('high_correlation_pairs.csv', index=False)
|
print(f"\n发现{len(high_corr_pairs)}对高度相关的特征对(|相关系数|>0.9),已保存到'high_correlation_pairs.csv'")
|
if len(high_corr_pairs) > 0:
|
print("\n高度相关的特征对示例:")
|
print(high_corr_df.head(5))
|
|
print("\n相关性分析完成,可以基于结果进行特征选择或降维。")
|
|
# 保存带有全部特征的清洗后数据
|
df_clean.to_csv('cleaned_feature_data.csv', index=False)
|
print(f"\n清洗后的特征数据已保存到 'cleaned_feature_data.csv',共{len(df_clean)}行,{len(df_clean.columns)}列")
|
|
|
|
|
# 生成好的数据送入模型训练
|
|
# -------------------------------
|
# 模型训练与预测,展示验证准确度(RMSE, MAE)
|
# -------------------------------
|