From c252cdc14c0f96e497e80fcdaab71dbaf0b2e725 Mon Sep 17 00:00:00 2001
From: rp <rp@outlook.com>
Date: 星期三, 09 四月 2025 23:24:08 +0800
Subject: [PATCH] 特征相关性分析

---
 yd_test.py |  301 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 files changed, 301 insertions(+), 0 deletions(-)

diff --git a/yd_test.py b/yd_test.py
new file mode 100644
index 0000000..d190435
--- /dev/null
+++ b/yd_test.py
@@ -0,0 +1,301 @@
+# xgboost淇敼鐗堟湰
+import os
+import pickle
+import pandas as pd
+import numpy as np
+import tkinter as tk
+import tkinter.font as tkfont
+from tkinter import ttk
+from datetime import timedelta
+from time import time
+import matplotlib.pyplot as plt
+from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg, NavigationToolbar2Tk
+from xgboost import XGBRegressor
+from lunardate import LunarDate
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import mean_squared_error, mean_absolute_error
+import matplotlib
+
# Configure matplotlib so CJK text in titles/labels renders correctly
# (tries several common Chinese fonts in order).
matplotlib.rcParams['font.sans-serif'] = ['SimHei', 'Microsoft YaHei', 'SimSun', 'Arial Unicode MS']
matplotlib.rcParams['axes.unicode_minus'] = False  # render minus sign with CJK fonts
matplotlib.rcParams['font.family'] = 'sans-serif'

# Global cache variables and feature names (feature_columns is only a
# placeholder here; populated by later training code).
cached_model = None        # last trained model, reused between runs
last_training_time = None  # timestamp of the last training run
feature_columns = None     # ordered list of feature column names
+
# -------------------------------
# Data loading and preprocessing
# -------------------------------
def load_data(upstream_file, downstream_file):
    """
    Load, clean, align and smooth the upstream/downstream salinity series.

    Both CSV files must hold three columns in the order
    (datetime, tag name, value); the header names are ignored.

    Returns a DataFrame sorted by ``DateTime`` with columns
    ``['DateTime', 'upstream', 'downstream', 'upstream_smooth',
    'downstream_smooth']``, or ``None`` when either file is missing.
    """
    try:
        upstream_df = pd.read_csv(upstream_file)
        downstream_df = pd.read_csv(downstream_file)
    except FileNotFoundError:
        print("鏂囦欢鏈壘鍒帮紝璇锋鏌ヨ矾寰�")
        return None

    # Raw columns are assumed to be [DateTime, TagName, Value] in order.
    upstream_df.columns = ['DateTime', 'TagName', 'Value']
    downstream_df.columns = ['DateTime', 'TagName', 'Value']

    # Parse timestamps and coerce readings to numeric (bad cells -> NaN).
    upstream_df['DateTime'] = pd.to_datetime(upstream_df['DateTime'])
    downstream_df['DateTime'] = pd.to_datetime(downstream_df['DateTime'])
    upstream_df['Value'] = pd.to_numeric(upstream_df['Value'], errors='coerce')
    downstream_df['Value'] = pd.to_numeric(downstream_df['Value'], errors='coerce')

    # Drop salinity readings below 5 (tunable data-quality threshold).
    upstream_df = upstream_df[upstream_df['Value'] >= 5]
    downstream_df = downstream_df[downstream_df['Value'] >= 5]

    # Treat zeros as missing and mark outliers beyond 3 standard
    # deviations as NaN so they get re-interpolated below.
    for df in [upstream_df, downstream_df]:
        df.loc[df['Value'] == 0, 'Value'] = np.nan
        mean_val, std_val = df['Value'].mean(), df['Value'].std()
        lower_bound, upper_bound = mean_val - 3 * std_val, mean_val + 3 * std_val
        df.loc[(df['Value'] < lower_bound) | (df['Value'] > upper_bound), 'Value'] = np.nan

    # Rename Value and keep only the needed columns.
    upstream_df = upstream_df.rename(columns={'Value': 'upstream'})[['DateTime', 'upstream']]
    downstream_df = downstream_df.rename(columns={'Value': 'downstream'})[['DateTime', 'downstream']]

    # Inner-join on timestamp so both series are aligned sample-for-sample.
    merged_df = pd.merge(upstream_df, downstream_df, on='DateTime', how='inner')
    print(f"鍚堝苟鍓嶆暟鎹鏁�: {len(merged_df)}")
    merged_df = merged_df.set_index('DateTime')

    # Fill gaps in three passes: short gaps linearly, longer gaps with
    # time-weighted interpolation, then forward/backward fill the edges.
    merged_df['upstream'] = merged_df['upstream'].interpolate(method='linear', limit=4)
    merged_df['downstream'] = merged_df['downstream'].interpolate(method='linear', limit=4)
    merged_df['upstream'] = merged_df['upstream'].interpolate(method='time', limit=24)
    merged_df['downstream'] = merged_df['downstream'].interpolate(method='time', limit=24)
    # BUGFIX: fillna(method='ffill'/'bfill') is deprecated and removed in
    # pandas 3.0 — use the dedicated ffill()/bfill() methods (same result).
    merged_df['upstream'] = merged_df['upstream'].ffill().bfill()
    merged_df['downstream'] = merged_df['downstream'].ffill().bfill()

    # Smooth with a centered 24-sample rolling mean.
    merged_df['upstream_smooth'] = merged_df['upstream'].rolling(window=24, min_periods=1, center=True).mean()
    merged_df['downstream_smooth'] = merged_df['downstream'].rolling(window=24, min_periods=1, center=True).mean()

    # Low-salinity sections (< 50) get a wider 48-sample window.
    # NOTE(review): this rolling runs over the masked subsequence only, so
    # non-adjacent timestamps can share a window — confirm this is intended.
    low_sal_mask = merged_df['upstream'] < 50
    if low_sal_mask.any():
        merged_df.loc[low_sal_mask, 'upstream_smooth'] = merged_df.loc[low_sal_mask, 'upstream']\
            .rolling(window=48, min_periods=1, center=True).mean()

    # Drop rows that are still missing or non-finite after filling.
    merged_df = merged_df.dropna()
    merged_df = merged_df[merged_df['upstream'].apply(np.isfinite)]
    merged_df = merged_df[merged_df['downstream'].apply(np.isfinite)]

    merged_df = merged_df.reset_index()
    print(f"娓呮礂鍚庢暟鎹鏁�: {len(merged_df)}")
    print(f"涓婃父鐩愬害鑼冨洿: {merged_df['upstream'].min()} - {merged_df['upstream'].max()}")
    print(f"涓嬫父鐩愬害鑼冨洿: {merged_df['downstream'].min()} - {merged_df['downstream'].max()}")

    merged_df = merged_df.sort_values('DateTime')
    return merged_df
+
+
+# Quick test / example usage (kept commented out)
+# df = load_data('闈掗緳娓�1.csv', '涓�鍙栨按.csv')
+# df.to_csv('merged_data.csv', index=False)
+# print(f"Merged data saved to 'merged_data.csv' successfully")
+
+# # Plot salinity over time (commented-out example)
+# plt.figure(figsize=(12, 6))
+# plt.plot(df['DateTime'], df['upstream_smooth'], label='涓婃父鐩愬害', color='blue')
+# plt.plot(df['DateTime'], df['downstream_smooth'], label='涓嬫父鐩愬害', color='red')
+# plt.xlabel('鏃堕棿')
+# plt.ylabel('鐩愬害')
+# plt.title('鐩愬害闅忔椂闂村彉鍖栧浘')
+# plt.legend()
+# plt.grid(True)
+# plt.tight_layout()
+# plt.savefig('salinity_time_series.png', dpi=300)
+# plt.show()
+
+
+# ---------------------- Feature engineering section
+
+
# -------------------------------
# Add lunar-calendar (tide) features
# -------------------------------
def add_lunar_features(df):
    """Append lunar (tide-related) features derived from each timestamp.

    Adds the lunar day-of-month, its sine/cosine encoding over a 15-day
    half-month cycle, and a 0/1 flag for spring-tide periods (lunar days
    1-5 and 16-20).
    """
    day_vals, sin_vals, cos_vals, tide_flags = [], [], [], []
    for dt in df['DateTime']:
        day = LunarDate.fromSolarDate(dt.year, dt.month, dt.day).day
        angle = 2 * np.pi * day / 15
        day_vals.append(day)
        sin_vals.append(np.sin(angle))
        cos_vals.append(np.cos(angle))
        tide_flags.append(1 if day <= 5 or 16 <= day <= 20 else 0)
    df['lunar_day'] = day_vals
    df['lunar_phase_sin'] = sin_vals
    df['lunar_phase_cos'] = cos_vals
    df['is_high_tide'] = tide_flags
    return df
+
+
# -------------------------------
# Generate lagged (delay) features, vectorized via shift
# -------------------------------
def batch_create_delay_features(df, delay_hours):
    """Add lagged copies of the smoothed series, one column per lag.

    For every lag h in ``delay_hours``, creates ``upstream_delay_{h}h``
    and ``downstream_delay_{h}h`` by shifting the corresponding
    ``*_smooth`` column down h rows (the first h rows become NaN).
    """
    for lag in delay_hours:
        for base in ('upstream', 'downstream'):
            df[f'{base}_delay_{lag}h'] = df[f'{base}_smooth'].shift(lag)
    return df
+
+
# -------------------------------
# Add calendar/time-of-day features
# -------------------------------
def add_time_features(df):
    """Extract hour-of-day, day-of-week and month from the DateTime column."""
    stamps = df['DateTime'].dt
    df['hour'] = stamps.hour
    df['weekday'] = stamps.dayofweek
    df['month'] = stamps.month
    return df
+
+
# -------------------------------
# Add rolling statistical features
# -------------------------------
def add_statistical_features(df):
    """Add trailing rolling statistics of the smoothed salinity series.

    Computes 1-day (24-sample) mean/std/max/min for both series, then a
    3-day (72-sample) mean. Windows are trailing, so the first
    ``window - 1`` rows of each new column are NaN.
    """
    sources = {'upstream_smooth': 'up', 'downstream_smooth': 'down'}

    # 1-day statistics
    for source, suffix in sources.items():
        daily = df[source].rolling(window=24)
        df[f'mean_1d_{suffix}'] = daily.mean()
        df[f'std_1d_{suffix}'] = daily.std()
        df[f'max_1d_{suffix}'] = daily.max()
        df[f'min_1d_{suffix}'] = daily.min()

    # 3-day statistics
    for source, suffix in sources.items():
        df[f'mean_3d_{suffix}'] = df[source].rolling(window=72).mean()

    return df
+
+
+
# Apply the feature engineering and run the correlation analysis.
if __name__ == "__main__":
    df = load_data('闈掗緳娓�1.csv', '涓�鍙栨按.csv')

    # Calendar/time-of-day features
    df = add_time_features(df)

    # Lunar (tide) features
    df = add_lunar_features(df)

    # Rolling statistical features
    df = add_statistical_features(df)

    # Lagged features at 1, 2, 3, 6, 12, 24, 48 and 72 hours
    delay_hours = [1, 2, 3, 6, 12, 24, 48, 72]
    df = batch_create_delay_features(df, delay_hours)

    # Drop rows left incomplete by the rolling/lag windows.
    df_clean = df.dropna()
    print(f"鍒犻櫎NaN鍚庣殑鏁版嵁琛屾暟: {len(df_clean)}")

    # ---- Feature correlation analysis ----
    print("\n杩涜鐗瑰緛鐩稿叧鎬у垎鏋�...")

    # FIX: hoisted from the plotting section below so a missing seaborn
    # fails fast instead of after the expensive correlation computation.
    import seaborn as sns

    # Only numeric columns take part in the correlation analysis.
    numeric_cols = df_clean.select_dtypes(include=['float64', 'int64']).columns.tolist()
    # Defensive: make sure the timestamp column never sneaks in.
    if 'DateTime' in numeric_cols:
        numeric_cols.remove('DateTime')

    # Full correlation matrix, saved for offline inspection.
    corr_matrix = df_clean[numeric_cols].corr()
    corr_matrix.to_csv('feature_correlation_matrix.csv')
    print("鐩稿叧鐭╅樀宸蹭繚瀛樺埌 'feature_correlation_matrix.csv'")

    # 1. Correlation of every feature with the target (downstream salinity).
    target_corrs = corr_matrix['downstream_smooth'].sort_values(ascending=False)
    target_corrs.to_csv('target_correlation.csv')
    print("\n涓庝笅娓哥洂搴︽渶鐩稿叧鐨勫墠10涓壒寰�:")
    print(target_corrs.head(10))

    # 2. Correlation heatmap.
    plt.figure(figsize=(16, 14))
    sns.heatmap(corr_matrix, annot=False, cmap='coolwarm', center=0, linewidths=0.5)
    plt.title('鐗瑰緛鐩稿叧鎬х儹鍥�', fontsize=16)
    plt.tight_layout()
    plt.savefig('correlation_heatmap.png', dpi=300)
    plt.close()
    print("鐩稿叧鎬х儹鍥惧凡淇濆瓨鍒� 'correlation_heatmap.png'")

    # 3. Bar chart of the 15 features most correlated with the target
    #    (index 0 is the target itself with correlation 1, so skip it).
    plt.figure(figsize=(12, 8))
    target_corrs.iloc[1:16].plot(kind='barh', color='skyblue')
    plt.title('涓庝笅娓哥洂搴︾浉鍏虫�ф渶楂樼殑15涓壒寰�', fontsize=14)
    plt.xlabel('鐩稿叧绯绘暟', fontsize=12)
    plt.tight_layout()
    plt.savefig('top_correlations.png', dpi=300)
    plt.close()
    print("鐩爣鐩稿叧鎬ф潯褰㈠浘宸蹭繚瀛樺埌 'top_correlations.png'")

    # 4. Highly correlated feature pairs (|r| > 0.9) — candidates for
    #    pruning/dimensionality reduction. Lower triangle only, so each
    #    unordered pair is reported once.
    high_corr_pairs = []
    for i in range(len(corr_matrix.columns)):
        for j in range(i):
            if abs(corr_matrix.iloc[i, j]) > 0.9:
                high_corr_pairs.append(
                    (corr_matrix.columns[i], corr_matrix.columns[j], corr_matrix.iloc[i, j])
                )

    high_corr_df = pd.DataFrame(high_corr_pairs, columns=['Feature1', 'Feature2', 'Correlation'])
    high_corr_df = high_corr_df.sort_values('Correlation', ascending=False)
    high_corr_df.to_csv('high_correlation_pairs.csv', index=False)
    print(f"\n鍙戠幇{len(high_corr_pairs)}瀵归珮搴︾浉鍏崇殑鐗瑰緛瀵�(|鐩稿叧绯绘暟|>0.9)锛屽凡淇濆瓨鍒�'high_correlation_pairs.csv'")
    if len(high_corr_pairs) > 0:
        print("\n楂樺害鐩稿叧鐨勭壒寰佸绀轰緥:")
        print(high_corr_df.head(5))

    print("\n鐩稿叧鎬у垎鏋愬畬鎴愶紝鍙互鍩轰簬缁撴灉杩涜鐗瑰緛閫夋嫨鎴栭檷缁淬��")

    # Persist the fully featured, cleaned dataset.
    df_clean.to_csv('cleaned_feature_data.csv', index=False)
    print(f"\n娓呮礂鍚庣殑鐗瑰緛鏁版嵁宸蹭繚瀛樺埌 'cleaned_feature_data.csv'锛屽叡{len(df_clean)}琛岋紝{len(df_clean.columns)}鍒�")
+
+
+
+
+# Feed the engineered data into model training
+
+# -------------------------------
+# Model training and prediction; report validation accuracy (RMSE, MAE)
+# -------------------------------
+
+
+

--
Gitblit v1.9.3