# XGBoost modified version - data loading, feature engineering and
# feature correlation analysis for upstream/downstream salinity series.
import os
import pickle
import pandas as pd
import numpy as np
import tkinter as tk
import tkinter.font as tkfont
from tkinter import ttk
from datetime import timedelta
from time import time
import matplotlib.pyplot as plt
from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg, NavigationToolbar2Tk
from xgboost import XGBRegressor
from lunardate import LunarDate
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
import matplotlib
import seaborn as sns  # hoisted from the __main__ block below

# Configure matplotlib so Chinese axis labels / titles render correctly
matplotlib.rcParams['font.sans-serif'] = ['SimHei', 'Microsoft YaHei', 'SimSun', 'Arial Unicode MS']
matplotlib.rcParams['axes.unicode_minus'] = False
matplotlib.rcParams['font.family'] = 'sans-serif'

# Global cache variables and feature names (feature_columns is a placeholder here)
cached_model = None
last_training_time = None
feature_columns = None


# -------------------------------
# Data loading and preprocessing
# -------------------------------
def load_data(upstream_file, downstream_file):
    """Load, clean and merge the upstream/downstream salinity CSV files.

    Parameters
    ----------
    upstream_file : str
        Path to the upstream salinity CSV (columns: DateTime, TagName, Value).
    downstream_file : str
        Path to the downstream salinity CSV (same layout).

    Returns
    -------
    pandas.DataFrame or None
        Merged frame with columns DateTime, upstream, downstream,
        upstream_smooth, downstream_smooth, sorted by DateTime;
        None when either file is missing.
    """
    try:
        upstream_df = pd.read_csv(upstream_file)
        downstream_df = pd.read_csv(downstream_file)
    except FileNotFoundError:
        print("File not found, please check the path")
        return None

    # The raw data columns are assumed to be ['DateTime', 'TagName', 'Value']
    upstream_df.columns = ['DateTime', 'TagName', 'Value']
    downstream_df.columns = ['DateTime', 'TagName', 'Value']

    # Parse timestamps and coerce values to numeric (bad entries become NaN)
    upstream_df['DateTime'] = pd.to_datetime(upstream_df['DateTime'])
    downstream_df['DateTime'] = pd.to_datetime(downstream_df['DateTime'])
    upstream_df['Value'] = pd.to_numeric(upstream_df['Value'], errors='coerce')
    downstream_df['Value'] = pd.to_numeric(downstream_df['Value'], errors='coerce')

    # Drop salinity readings below 5 (this threshold can be adjusted)
    upstream_df = upstream_df[upstream_df['Value'] >= 5]
    downstream_df = downstream_df[downstream_df['Value'] >= 5]

    # Replace zeros with NaN and null out outliers beyond 3 standard deviations
    for df in [upstream_df, downstream_df]:
        df.loc[df['Value'] == 0, 'Value'] = np.nan
        mean_val, std_val = df['Value'].mean(), df['Value'].std()
        lower_bound, upper_bound = mean_val - 3 * std_val, mean_val + 3 * std_val
        df.loc[(df['Value'] < lower_bound) | (df['Value'] > upper_bound), 'Value'] = np.nan

    # Rename Value and keep only the needed columns
    upstream_df = upstream_df.rename(columns={'Value': 'upstream'})[['DateTime', 'upstream']]
    downstream_df = downstream_df.rename(columns={'Value': 'downstream'})[['DateTime', 'downstream']]

    # Inner-join on timestamp so only rows present in both series survive
    merged_df = pd.merge(upstream_df, downstream_df, on='DateTime', how='inner')
    print(f"Row count after merge: {len(merged_df)}")
    merged_df = merged_df.set_index('DateTime')

    # Fill gaps: linear interpolation first, then time-based interpolation
    # (valid here because the index is a DatetimeIndex), finally ffill/bfill.
    merged_df['upstream'] = merged_df['upstream'].interpolate(method='linear', limit=4)
    merged_df['downstream'] = merged_df['downstream'].interpolate(method='linear', limit=4)
    merged_df['upstream'] = merged_df['upstream'].interpolate(method='time', limit=24)
    merged_df['downstream'] = merged_df['downstream'].interpolate(method='time', limit=24)
    # FIX: fillna(method=...) is deprecated (removed in pandas 3) -> ffill()/bfill()
    merged_df['upstream'] = merged_df['upstream'].ffill().bfill()
    merged_df['downstream'] = merged_df['downstream'].ffill().bfill()

    # Smooth with a centred 24-sample rolling mean
    merged_df['upstream_smooth'] = merged_df['upstream'].rolling(window=24, min_periods=1, center=True).mean()
    merged_df['downstream_smooth'] = merged_df['downstream'].rolling(window=24, min_periods=1, center=True).mean()

    # Use a wider (48-sample) window for the low-salinity regime (< 50).
    # NOTE(review): the rolling mean runs over the masked subset only, so
    # non-adjacent low-salinity rows are treated as contiguous -- confirm
    # this is the intended behaviour.
    low_sal_mask = merged_df['upstream'] < 50
    if low_sal_mask.any():
        merged_df.loc[low_sal_mask, 'upstream_smooth'] = merged_df.loc[low_sal_mask, 'upstream']\
            .rolling(window=48, min_periods=1, center=True).mean()

    merged_df = merged_df.dropna()
    # Vectorised finite check (replaces the slower Series.apply(np.isfinite))
    merged_df = merged_df[np.isfinite(merged_df['upstream'])]
    merged_df = merged_df[np.isfinite(merged_df['downstream'])]

    merged_df = merged_df.reset_index()
    print(f"Row count after cleaning: {len(merged_df)}")
    print(f"Upstream salinity range: {merged_df['upstream'].min()} - {merged_df['upstream'].max()}")
    print(f"Downstream salinity range: {merged_df['downstream'].min()} - {merged_df['downstream'].max()}")

    merged_df = merged_df.sort_values('DateTime')
    return merged_df


# ---------------------- Feature engineering ----------------------

# -------------------------------
# Lunar (tidal) calendar features
# -------------------------------
def add_lunar_features(df):
    """Append lunar-calendar tide features derived from each DateTime.

    Adds columns: lunar_day, lunar_phase_sin, lunar_phase_cos, is_high_tide.
    The 15-day half-cycle encoding follows the spring/neap tide rhythm.
    """
    lunar_day, lunar_phase_sin, lunar_phase_cos, is_high_tide = [], [], [], []
    for dt in df['DateTime']:
        ld = LunarDate.fromSolarDate(dt.year, dt.month, dt.day)
        lunar_day.append(ld.day)
        lunar_phase_sin.append(np.sin(2 * np.pi * ld.day / 15))
        lunar_phase_cos.append(np.cos(2 * np.pi * ld.day / 15))
        # Lunar days 1-5 and 16-20 are flagged as high-tide periods
        is_high_tide.append(1 if (ld.day <= 5 or 16 <= ld.day <= 20) else 0)
    df['lunar_day'] = lunar_day
    df['lunar_phase_sin'] = lunar_phase_sin
    df['lunar_phase_cos'] = lunar_phase_cos
    df['is_high_tide'] = is_high_tide
    return df


# -------------------------------
# Delay (lag) features, vectorised via shift
# -------------------------------
def batch_create_delay_features(df, delay_hours):
    """Add lagged copies of the smoothed series for each delay in *delay_hours*."""
    for delay in delay_hours:
        df[f'upstream_delay_{delay}h'] = df['upstream_smooth'].shift(delay)
        df[f'downstream_delay_{delay}h'] = df['downstream_smooth'].shift(delay)
    return df


# -------------------------------
# Calendar / time-of-day features
# -------------------------------
def add_time_features(df):
    """Add hour-of-day, weekday and month columns derived from DateTime."""
    df['hour'] = df['DateTime'].dt.hour
    df['weekday'] = df['DateTime'].dt.dayofweek
    df['month'] = df['DateTime'].dt.month
    return df


# -------------------------------
# Rolling statistical features
# -------------------------------
def add_statistical_features(df):
    """Add 1-day (24 h) and 3-day (72 h) rolling statistics for both series."""
    # 1-day statistics
    df['mean_1d_up'] = df['upstream_smooth'].rolling(window=24).mean()
    df['std_1d_up'] = df['upstream_smooth'].rolling(window=24).std()
    df['max_1d_up'] = df['upstream_smooth'].rolling(window=24).max()
    df['min_1d_up'] = df['upstream_smooth'].rolling(window=24).min()

    df['mean_1d_down'] = df['downstream_smooth'].rolling(window=24).mean()
    df['std_1d_down'] = df['downstream_smooth'].rolling(window=24).std()
    df['max_1d_down'] = df['downstream_smooth'].rolling(window=24).max()
    df['min_1d_down'] = df['downstream_smooth'].rolling(window=24).min()

    # 3-day statistics
    df['mean_3d_up'] = df['upstream_smooth'].rolling(window=72).mean()
    df['mean_3d_down'] = df['downstream_smooth'].rolling(window=72).mean()

    return df


# Apply feature engineering and run the correlation analysis
if __name__ == "__main__":
    df = load_data('青龙港1.csv', '一取水.csv')

    # Time-of-day / calendar features
    df = add_time_features(df)

    # Lunar (tide) features
    df = add_lunar_features(df)

    # Rolling statistical features
    df = add_statistical_features(df)

    # Lag features for 1, 2, 3, 6, 12, 24, 48 and 72 hours
    delay_hours = [1, 2, 3, 6, 12, 24, 48, 72]
    df = batch_create_delay_features(df, delay_hours)

    # Drop rows made incomplete by the rolling/lag windows
    df_clean = df.dropna()
    print(f"Row count after dropping NaN: {len(df_clean)}")

    # Feature correlation analysis
    print("\nRunning feature correlation analysis...")

    # Select numeric columns only (DateTime excluded)
    numeric_cols = df_clean.select_dtypes(include=['float64', 'int64']).columns.tolist()
    if 'DateTime' in numeric_cols:
        numeric_cols.remove('DateTime')

    # Full correlation matrix
    corr_matrix = df_clean[numeric_cols].corr()
    corr_matrix.to_csv('feature_correlation_matrix.csv')
    print("Correlation matrix saved to 'feature_correlation_matrix.csv'")

    # 1. Correlation of every feature with the target (downstream salinity)
    target_corrs = corr_matrix['downstream_smooth'].sort_values(ascending=False)
    target_corrs.to_csv('target_correlation.csv')
    print("\nTop 10 features most correlated with downstream salinity:")
    print(target_corrs.head(10))

    # 2. Correlation heatmap
    plt.figure(figsize=(16, 14))
    sns.heatmap(corr_matrix, annot=False, cmap='coolwarm', center=0, linewidths=0.5)
    plt.title('Feature correlation heatmap', fontsize=16)
    plt.tight_layout()
    plt.savefig('correlation_heatmap.png', dpi=300)
    plt.close()
    print("Heatmap saved to 'correlation_heatmap.png'")

    # 3. Bar chart of the 15 features most correlated with the target
    plt.figure(figsize=(12, 8))
    target_corrs.iloc[1:16].plot(kind='barh', color='skyblue')  # skip self-correlation (=1)
    plt.title('Top 15 features correlated with downstream salinity', fontsize=14)
    plt.xlabel('Correlation coefficient', fontsize=12)
    plt.tight_layout()
    plt.savefig('top_correlations.png', dpi=300)
    plt.close()
    print("Target-correlation bar chart saved to 'top_correlations.png'")

    # 4. Detect highly correlated feature pairs (|r| > 0.9) over the lower triangle
    high_corr_pairs = []
    for i in range(len(corr_matrix.columns)):
        for j in range(i):
            if abs(corr_matrix.iloc[i, j]) > 0.9:
                high_corr_pairs.append(
                    (corr_matrix.columns[i], corr_matrix.columns[j], corr_matrix.iloc[i, j])
                )

    high_corr_df = pd.DataFrame(high_corr_pairs, columns=['Feature1', 'Feature2', 'Correlation'])
    high_corr_df = high_corr_df.sort_values('Correlation', ascending=False)
    high_corr_df.to_csv('high_correlation_pairs.csv', index=False)
    print(f"\nFound {len(high_corr_pairs)} highly correlated feature pairs (|r| > 0.9), "
          f"saved to 'high_correlation_pairs.csv'")
    if len(high_corr_pairs) > 0:
        print("\nExamples of highly correlated pairs:")
        print(high_corr_df.head(5))

    print("\nCorrelation analysis complete; use the results for feature selection "
          "or dimensionality reduction.")

    # Persist the cleaned, fully feature-engineered data set
    df_clean.to_csv('cleaned_feature_data.csv', index=False)
    print(f"\nCleaned feature data saved to 'cleaned_feature_data.csv': "
          f"{len(df_clean)} rows, {len(df_clean.columns)} columns")


# Feed the prepared data into model training

# -------------------------------
# Model training and prediction with validation metrics (RMSE, MAE)
# -------------------------------