import matplotlib.pyplot as plt import numpy as np import matplotlib import pandas as pd from datetime import datetime, timedelta from scipy.signal import savgol_filter # 设置中文字体支持 matplotlib.rcParams['font.sans-serif'] = ['SimHei'] # 用来正常显示中文标签 matplotlib.rcParams['axes.unicode_minus'] = False # 用来正常显示负号 # 定义改进的盐度数据异常过滤方法 def filter_salinity_anomalies(df, threshold_ratio=0.5, window_size=5, max_days=1): # 复制数据,避免修改原始数据 filtered_df = df.copy() values = filtered_df['Value'].values dates = filtered_df['DateTime'].values # 1. 首先处理单个异常点 i = 1 while i < len(values): # 检查当前值是否小于前一个值的threshold_ratio if values[i] < values[i-1] * threshold_ratio: baseline = values[i-1] # 基准值为上一个正常的盐度值 anomaly_start = i j = i # 向后查找,直到找到一个不小于基准值threshold_ratio的点 # 或者直到时间区间超过max_days天 anomaly_start_date = dates[anomaly_start] max_date = anomaly_start_date + np.timedelta64(int(max_days*24), 'h') while j < len(values) and values[j] < baseline * threshold_ratio and dates[j] <= max_date: j += 1 anomaly_end = j - 1 # 异常区间的结束位置 # 处理异常区间 if anomaly_end - anomaly_start < 3: # 短区间用线性插值 if j < len(values): # 如果异常区间后还有数据点,使用线性插值 for k in range(anomaly_start, anomaly_end + 1): # 线性插值:在基准值和异常区间后第一个正常值之间进行平滑过渡 ratio = (k - anomaly_start + 1) / (anomaly_end - anomaly_start + 2) values[k] = baseline * (1 - ratio) + values[j] * ratio # 确保平滑后的值不低于基准的threshold_ratio values[k] = max(values[k], baseline * threshold_ratio) else: # 如果异常区间到数据末尾,使用基准值的threshold_ratio填充 for k in range(anomaly_start, anomaly_end + 1): values[k] = baseline * threshold_ratio else: # 长区间使用更简单的平滑方式,避免插值错误 # 使用线性插值来避免非有限值问题 if j < len(values): end_val = values[j] # 为每个点创建线性插值 for k in range(anomaly_start, anomaly_end + 1): fraction = (k - anomaly_start) / (j - anomaly_start) if j > anomaly_start else 0 interpolated = baseline * (1 - fraction) + end_val * fraction values[k] = max(interpolated, baseline * threshold_ratio) else: # 如果异常区间到数据末尾,使用基准值的threshold_ratio填充 for k in range(anomaly_start, anomaly_end + 1): values[k] = baseline * threshold_ratio i = j # 跳过已处理的异常区间 else: i += 1 # 2. 应用Savitzky-Golay滤波进行整体平滑 if len(values) > window_size: # 确保window_size是奇数 if window_size % 2 == 0: window_size += 1 # 应用Savitzky-Golay滤波 try: # 对数据进行平滑,但保留原始的特性 smoothed = savgol_filter(values, window_size, 3) # 确保平滑后的数据不会小于相邻点的threshold_ratio for i in range(1, len(smoothed)): smoothed[i] = max(smoothed[i], smoothed[i-1] * threshold_ratio) values = smoothed except Exception as e: print(f"Savitzky-Golay滤波应用失败: {e}") filtered_df['Value'] = values return filtered_df # 读取CSV数据 df1 = pd.read_csv('yuce_data/一取水.csv') df2 = pd.read_csv('yuce_data/青龙港1.csv') df3 = pd.read_csv('yuce_data/太仓石化盐度2.csv') # 将时间列转换为datetime格式 df1['DateTime'] = pd.to_datetime(df1['DateTime']) df2['DateTime'] = pd.to_datetime(df2['DateTime']) df3['DateTime'] = pd.to_datetime(df3['DateTime']) # 应用盐度数据异常过滤方法 filtered_df1 = filter_salinity_anomalies(df1, threshold_ratio=0.5, window_size=7, max_days=1) filtered_df2 = filter_salinity_anomalies(df2, threshold_ratio=0.5, window_size=7, max_days=1) filtered_df3 = filter_salinity_anomalies(df3, threshold_ratio=0.5, window_size=7, max_days=1) # 创建图表,调整为4个子图 plt.figure(figsize=(16, 16)) # 绘制一取水的原始数据和过滤后的数据 plt.subplot(4, 1, 1) plt.plot(df1['DateTime'], df1['Value'], 'o-', color='lightblue', linewidth=1, markersize=2, alpha=0.6, label='一取水(原始)') plt.plot(filtered_df1['DateTime'], filtered_df1['Value'], 'o-', color='blue', linewidth=1.5, markersize=3, label='一取水(过滤后)') plt.title('一取水盐度监测数据异常过滤', fontsize=16) plt.ylabel('盐度值', fontsize=14) plt.legend(loc='best', fontsize=10) plt.grid(True, linestyle='--', alpha=0.7) # 绘制青龙港1的原始数据和过滤后的数据 plt.subplot(4, 1, 2) plt.plot(df2['DateTime'], df2['Value'], 's-', color='lightcoral', linewidth=1, markersize=2, alpha=0.6, label='青龙港1(原始)') plt.plot(filtered_df2['DateTime'], filtered_df2['Value'], 's-', color='red', linewidth=1.5, markersize=3, label='青龙港1(过滤后)') plt.title('青龙港1盐度监测数据异常过滤', fontsize=16) plt.ylabel('盐度值', fontsize=14) plt.legend(loc='best', fontsize=10) plt.grid(True, linestyle='--', alpha=0.7) # 绘制太仓石化盐度1的原始数据和过滤后的数据 plt.subplot(4, 1, 3) plt.plot(df3['DateTime'], df3['Value'], '^-', color='lightgreen', linewidth=1, markersize=2, alpha=0.6, label='太仓石化1(原始)') plt.plot(filtered_df3['DateTime'], filtered_df3['Value'], '^-', color='green', linewidth=1.5, markersize=3, label='太仓石化1(过滤后)') plt.title('太仓石化盐度1监测数据异常过滤', fontsize=16) plt.ylabel('盐度值', fontsize=14) plt.legend(loc='best', fontsize=10) plt.grid(True, linestyle='--', alpha=0.7) # 计算过滤前后的差异 df1['Difference'] = filtered_df1['Value'] - df1['Value'] df2['Difference'] = filtered_df2['Value'] - df2['Value'] df3['Difference'] = filtered_df3['Value'] - df3['Value'] # 绘制差异 plt.subplot(4, 1, 4) plt.plot(df1['DateTime'], df1['Difference'], 'o-', color='blue', linewidth=1.5, markersize=3, label='一取水(差异)') plt.plot(df2['DateTime'], df2['Difference'], 's-', color='red', linewidth=1.5, markersize=3, label='青龙港1(差异)') plt.plot(df3['DateTime'], df3['Difference'], '^-', color='green', linewidth=1.5, markersize=3, label='太仓石化1(差异)') plt.title('数据处理前后差异', fontsize=16) plt.xlabel('时间', fontsize=14) plt.ylabel('差异值', fontsize=14) plt.legend(loc='best', fontsize=10) plt.grid(True, linestyle='--', alpha=0.7) # 格式化x轴日期显示 for ax in plt.gcf().get_axes(): ax.xaxis_date() plt.gcf().autofmt_xdate() # 显示图表 plt.tight_layout() plt.show() # 保存过滤后的数据 filtered_df1.to_csv('yuce_data/一取水_filtered.csv', index=False) filtered_df2.to_csv('yuce_data/青龙港1_filtered.csv', index=False) filtered_df3.to_csv('yuce_data/太仓石化盐度1_filtered.csv', index=False) # 打印过滤前后的统计信息及处理区间 print("\n数据过滤前后统计信息:") print("\n一取水:") print(f"原始数据 - 最小值: {df1['Value'].min():.2f}, 最大值: {df1['Value'].max():.2f}, 均值: {df1['Value'].mean():.2f}, 标准差: {df1['Value'].std():.2f}") print(f"过滤后数据 - 最小值: {filtered_df1['Value'].min():.2f}, 最大值: {filtered_df1['Value'].max():.2f}, 均值: {filtered_df1['Value'].mean():.2f}, 标准差: {filtered_df1['Value'].std():.2f}") print("\n青龙港1:") print(f"原始数据 - 最小值: {df2['Value'].min():.2f}, 最大值: {df2['Value'].max():.2f}, 均值: {df2['Value'].mean():.2f}, 标准差: {df2['Value'].std():.2f}") print(f"过滤后数据 - 最小值: {filtered_df2['Value'].min():.2f}, 最大值: {filtered_df2['Value'].max():.2f}, 均值: {filtered_df2['Value'].mean():.2f}, 标准差: {filtered_df2['Value'].std():.2f}") print("\n太仓石化盐度1:") print(f"原始数据 - 最小值: {df3['Value'].min():.2f}, 最大值: {df3['Value'].max():.2f}, 均值: {df3['Value'].mean():.2f}, 标准差: {df3['Value'].std():.2f}") print(f"过滤后数据 - 最小值: {filtered_df3['Value'].min():.2f}, 最大值: {filtered_df3['Value'].max():.2f}, 均值: {filtered_df3['Value'].mean():.2f}, 标准差: {filtered_df3['Value'].std():.2f}") # 找出异常点,计算处理的区间数量和时间跨度 def get_anomaly_stats(df_orig, df_filtered): diff = abs(df_filtered['Value'] - df_orig['Value']) anomalies = df_orig[diff > 0.01] # 差异大于0.01的点认为是异常点 if len(anomalies) == 0: return 0, "无异常区间" # 计算连续的异常区间 anomaly_indices = anomalies.index.tolist() intervals = [] start_idx = anomaly_indices[0] prev_idx = start_idx for idx in anomaly_indices[1:]: if idx > prev_idx + 1: # 不连续 intervals.append((start_idx, prev_idx)) start_idx = idx prev_idx = idx intervals.append((start_idx, prev_idx)) # 添加最后一个区间 # 计算区间的时间跨度 time_spans = [] for start, end in intervals: if start == end: span = "单点" else: delta = df_orig.iloc[end]['DateTime'] - df_orig.iloc[start]['DateTime'] hours = delta.total_seconds() / 3600 if hours < 1: span = f"{int(hours * 60)}分钟" elif hours < 24: span = f"{int(hours)}小时" else: span = f"{int(hours/24)}天{int(hours%24)}小时" time_spans.append(span) return len(intervals), time_spans anomaly_count1, time_spans1 = get_anomaly_stats(df1, filtered_df1) anomaly_count2, time_spans2 = get_anomaly_stats(df2, filtered_df2) anomaly_count3, time_spans3 = get_anomaly_stats(df3, filtered_df3) print("\n异常区间统计:") print(f"一取水 - 异常区间数量: {anomaly_count1}") if anomaly_count1 > 0: print(f"区间时间跨度: {time_spans1}") print(f"青龙港1 - 异常区间数量: {anomaly_count2}") if anomaly_count2 > 0: print(f"区间时间跨度: {time_spans2}") print(f"太仓石化盐度1 - 异常区间数量: {anomaly_count3}") if anomaly_count3 > 0: print(f"区间时间跨度: {time_spans3}")