import matplotlib.pyplot as plt
|
import numpy as np
|
import matplotlib
|
import pandas as pd
|
from datetime import datetime, timedelta
|
from scipy.signal import savgol_filter
|
|
# 设置中文字体支持
|
matplotlib.rcParams['font.sans-serif'] = ['SimHei'] # 用来正常显示中文标签
|
matplotlib.rcParams['axes.unicode_minus'] = False # 用来正常显示负号
|
|
# 定义改进的盐度数据异常过滤方法
|
def filter_salinity_anomalies(df, threshold_ratio=0.5, window_size=5, max_days=1):
|
# 复制数据,避免修改原始数据
|
filtered_df = df.copy()
|
values = filtered_df['Value'].values
|
dates = filtered_df['DateTime'].values
|
|
# 1. 首先处理单个异常点
|
i = 1
|
while i < len(values):
|
# 检查当前值是否小于前一个值的threshold_ratio
|
if values[i] < values[i-1] * threshold_ratio:
|
baseline = values[i-1] # 基准值为上一个正常的盐度值
|
anomaly_start = i
|
j = i
|
|
# 向后查找,直到找到一个不小于基准值threshold_ratio的点
|
# 或者直到时间区间超过max_days天
|
anomaly_start_date = dates[anomaly_start]
|
max_date = anomaly_start_date + np.timedelta64(int(max_days*24), 'h')
|
|
while j < len(values) and values[j] < baseline * threshold_ratio and dates[j] <= max_date:
|
j += 1
|
|
anomaly_end = j - 1 # 异常区间的结束位置
|
|
# 处理异常区间
|
if anomaly_end - anomaly_start < 3: # 短区间用线性插值
|
if j < len(values):
|
# 如果异常区间后还有数据点,使用线性插值
|
for k in range(anomaly_start, anomaly_end + 1):
|
# 线性插值:在基准值和异常区间后第一个正常值之间进行平滑过渡
|
ratio = (k - anomaly_start + 1) / (anomaly_end - anomaly_start + 2)
|
values[k] = baseline * (1 - ratio) + values[j] * ratio
|
# 确保平滑后的值不低于基准的threshold_ratio
|
values[k] = max(values[k], baseline * threshold_ratio)
|
else:
|
# 如果异常区间到数据末尾,使用基准值的threshold_ratio填充
|
for k in range(anomaly_start, anomaly_end + 1):
|
values[k] = baseline * threshold_ratio
|
else: # 长区间使用更简单的平滑方式,避免插值错误
|
# 使用线性插值来避免非有限值问题
|
if j < len(values):
|
end_val = values[j]
|
# 为每个点创建线性插值
|
for k in range(anomaly_start, anomaly_end + 1):
|
fraction = (k - anomaly_start) / (j - anomaly_start) if j > anomaly_start else 0
|
interpolated = baseline * (1 - fraction) + end_val * fraction
|
values[k] = max(interpolated, baseline * threshold_ratio)
|
else:
|
# 如果异常区间到数据末尾,使用基准值的threshold_ratio填充
|
for k in range(anomaly_start, anomaly_end + 1):
|
values[k] = baseline * threshold_ratio
|
|
i = j # 跳过已处理的异常区间
|
else:
|
i += 1
|
|
# 2. 应用Savitzky-Golay滤波进行整体平滑
|
if len(values) > window_size:
|
# 确保window_size是奇数
|
if window_size % 2 == 0:
|
window_size += 1
|
|
# 应用Savitzky-Golay滤波
|
try:
|
# 对数据进行平滑,但保留原始的特性
|
smoothed = savgol_filter(values, window_size, 3)
|
|
# 确保平滑后的数据不会小于相邻点的threshold_ratio
|
for i in range(1, len(smoothed)):
|
smoothed[i] = max(smoothed[i], smoothed[i-1] * threshold_ratio)
|
|
values = smoothed
|
except Exception as e:
|
print(f"Savitzky-Golay滤波应用失败: {e}")
|
|
filtered_df['Value'] = values
|
return filtered_df
|
|
# 读取CSV数据
|
df1 = pd.read_csv('yuce_data/一取水.csv')
|
df2 = pd.read_csv('yuce_data/青龙港1.csv')
|
df3 = pd.read_csv('yuce_data/太仓石化盐度2.csv')
|
|
# 将时间列转换为datetime格式
|
df1['DateTime'] = pd.to_datetime(df1['DateTime'])
|
df2['DateTime'] = pd.to_datetime(df2['DateTime'])
|
df3['DateTime'] = pd.to_datetime(df3['DateTime'])
|
|
# 应用盐度数据异常过滤方法
|
filtered_df1 = filter_salinity_anomalies(df1, threshold_ratio=0.5, window_size=7, max_days=1)
|
filtered_df2 = filter_salinity_anomalies(df2, threshold_ratio=0.5, window_size=7, max_days=1)
|
filtered_df3 = filter_salinity_anomalies(df3, threshold_ratio=0.5, window_size=7, max_days=1)
|
|
# 创建图表,调整为4个子图
|
plt.figure(figsize=(16, 16))
|
|
# 绘制一取水的原始数据和过滤后的数据
|
plt.subplot(4, 1, 1)
|
plt.plot(df1['DateTime'], df1['Value'], 'o-', color='lightblue', linewidth=1, markersize=2, alpha=0.6, label='一取水(原始)')
|
plt.plot(filtered_df1['DateTime'], filtered_df1['Value'], 'o-', color='blue', linewidth=1.5, markersize=3, label='一取水(过滤后)')
|
plt.title('一取水盐度监测数据异常过滤', fontsize=16)
|
plt.ylabel('盐度值', fontsize=14)
|
plt.legend(loc='best', fontsize=10)
|
plt.grid(True, linestyle='--', alpha=0.7)
|
|
# 绘制青龙港1的原始数据和过滤后的数据
|
plt.subplot(4, 1, 2)
|
plt.plot(df2['DateTime'], df2['Value'], 's-', color='lightcoral', linewidth=1, markersize=2, alpha=0.6, label='青龙港1(原始)')
|
plt.plot(filtered_df2['DateTime'], filtered_df2['Value'], 's-', color='red', linewidth=1.5, markersize=3, label='青龙港1(过滤后)')
|
plt.title('青龙港1盐度监测数据异常过滤', fontsize=16)
|
plt.ylabel('盐度值', fontsize=14)
|
plt.legend(loc='best', fontsize=10)
|
plt.grid(True, linestyle='--', alpha=0.7)
|
|
# 绘制太仓石化盐度1的原始数据和过滤后的数据
|
plt.subplot(4, 1, 3)
|
plt.plot(df3['DateTime'], df3['Value'], '^-', color='lightgreen', linewidth=1, markersize=2, alpha=0.6, label='太仓石化1(原始)')
|
plt.plot(filtered_df3['DateTime'], filtered_df3['Value'], '^-', color='green', linewidth=1.5, markersize=3, label='太仓石化1(过滤后)')
|
plt.title('太仓石化盐度1监测数据异常过滤', fontsize=16)
|
plt.ylabel('盐度值', fontsize=14)
|
plt.legend(loc='best', fontsize=10)
|
plt.grid(True, linestyle='--', alpha=0.7)
|
|
# 计算过滤前后的差异
|
df1['Difference'] = filtered_df1['Value'] - df1['Value']
|
df2['Difference'] = filtered_df2['Value'] - df2['Value']
|
df3['Difference'] = filtered_df3['Value'] - df3['Value']
|
|
# 绘制差异
|
plt.subplot(4, 1, 4)
|
plt.plot(df1['DateTime'], df1['Difference'], 'o-', color='blue', linewidth=1.5, markersize=3, label='一取水(差异)')
|
plt.plot(df2['DateTime'], df2['Difference'], 's-', color='red', linewidth=1.5, markersize=3, label='青龙港1(差异)')
|
plt.plot(df3['DateTime'], df3['Difference'], '^-', color='green', linewidth=1.5, markersize=3, label='太仓石化1(差异)')
|
plt.title('数据处理前后差异', fontsize=16)
|
plt.xlabel('时间', fontsize=14)
|
plt.ylabel('差异值', fontsize=14)
|
plt.legend(loc='best', fontsize=10)
|
plt.grid(True, linestyle='--', alpha=0.7)
|
|
# 格式化x轴日期显示
|
for ax in plt.gcf().get_axes():
|
ax.xaxis_date()
|
plt.gcf().autofmt_xdate()
|
|
# 显示图表
|
plt.tight_layout()
|
plt.show()
|
|
# 保存过滤后的数据
|
filtered_df1.to_csv('yuce_data/一取水_filtered.csv', index=False)
|
filtered_df2.to_csv('yuce_data/青龙港1_filtered.csv', index=False)
|
filtered_df3.to_csv('yuce_data/太仓石化盐度1_filtered.csv', index=False)
|
|
# 打印过滤前后的统计信息及处理区间
|
print("\n数据过滤前后统计信息:")
|
print("\n一取水:")
|
print(f"原始数据 - 最小值: {df1['Value'].min():.2f}, 最大值: {df1['Value'].max():.2f}, 均值: {df1['Value'].mean():.2f}, 标准差: {df1['Value'].std():.2f}")
|
print(f"过滤后数据 - 最小值: {filtered_df1['Value'].min():.2f}, 最大值: {filtered_df1['Value'].max():.2f}, 均值: {filtered_df1['Value'].mean():.2f}, 标准差: {filtered_df1['Value'].std():.2f}")
|
|
print("\n青龙港1:")
|
print(f"原始数据 - 最小值: {df2['Value'].min():.2f}, 最大值: {df2['Value'].max():.2f}, 均值: {df2['Value'].mean():.2f}, 标准差: {df2['Value'].std():.2f}")
|
print(f"过滤后数据 - 最小值: {filtered_df2['Value'].min():.2f}, 最大值: {filtered_df2['Value'].max():.2f}, 均值: {filtered_df2['Value'].mean():.2f}, 标准差: {filtered_df2['Value'].std():.2f}")
|
|
print("\n太仓石化盐度1:")
|
print(f"原始数据 - 最小值: {df3['Value'].min():.2f}, 最大值: {df3['Value'].max():.2f}, 均值: {df3['Value'].mean():.2f}, 标准差: {df3['Value'].std():.2f}")
|
print(f"过滤后数据 - 最小值: {filtered_df3['Value'].min():.2f}, 最大值: {filtered_df3['Value'].max():.2f}, 均值: {filtered_df3['Value'].mean():.2f}, 标准差: {filtered_df3['Value'].std():.2f}")
|
|
# 找出异常点,计算处理的区间数量和时间跨度
|
def get_anomaly_stats(df_orig, df_filtered):
|
diff = abs(df_filtered['Value'] - df_orig['Value'])
|
anomalies = df_orig[diff > 0.01] # 差异大于0.01的点认为是异常点
|
|
if len(anomalies) == 0:
|
return 0, "无异常区间"
|
|
# 计算连续的异常区间
|
anomaly_indices = anomalies.index.tolist()
|
intervals = []
|
start_idx = anomaly_indices[0]
|
prev_idx = start_idx
|
|
for idx in anomaly_indices[1:]:
|
if idx > prev_idx + 1: # 不连续
|
intervals.append((start_idx, prev_idx))
|
start_idx = idx
|
prev_idx = idx
|
|
intervals.append((start_idx, prev_idx)) # 添加最后一个区间
|
|
# 计算区间的时间跨度
|
time_spans = []
|
for start, end in intervals:
|
if start == end:
|
span = "单点"
|
else:
|
delta = df_orig.iloc[end]['DateTime'] - df_orig.iloc[start]['DateTime']
|
hours = delta.total_seconds() / 3600
|
if hours < 1:
|
span = f"{int(hours * 60)}分钟"
|
elif hours < 24:
|
span = f"{int(hours)}小时"
|
else:
|
span = f"{int(hours/24)}天{int(hours%24)}小时"
|
time_spans.append(span)
|
|
return len(intervals), time_spans
|
|
anomaly_count1, time_spans1 = get_anomaly_stats(df1, filtered_df1)
|
anomaly_count2, time_spans2 = get_anomaly_stats(df2, filtered_df2)
|
anomaly_count3, time_spans3 = get_anomaly_stats(df3, filtered_df3)
|
|
print("\n异常区间统计:")
|
print(f"一取水 - 异常区间数量: {anomaly_count1}")
|
if anomaly_count1 > 0:
|
print(f"区间时间跨度: {time_spans1}")
|
|
print(f"青龙港1 - 异常区间数量: {anomaly_count2}")
|
if anomaly_count2 > 0:
|
print(f"区间时间跨度: {time_spans2}")
|
|
print(f"太仓石化盐度1 - 异常区间数量: {anomaly_count3}")
|
if anomaly_count3 > 0:
|
print(f"区间时间跨度: {time_spans3}")
|