IStation/shys/ChXcAi.git

import matplotlib.pyplot as plt
import numpy as np
import matplotlib
import pandas as pd
from datetime import datetime, timedelta
from scipy.signal import savgol_filter
 
# 设置中文字体支持
matplotlib.rcParams['font.sans-serif'] = ['SimHei']  # 用来正常显示中文标签
matplotlib.rcParams['axes.unicode_minus'] = False  # 用来正常显示负号
 
# 定义改进的盐度数据异常过滤方法
def filter_salinity_anomalies(df, threshold_ratio=0.5, window_size=5, max_days=1):
    # 复制数据，避免修改原始数据
    filtered_df = df.copy()
    values = filtered_df['Value'].values
    dates = filtered_df['DateTime'].values
    
    # 1. 首先处理单个异常点
    i = 1
    while i < len(values):
        # 检查当前值是否小于前一个值的threshold_ratio
        if values[i] < values[i-1] * threshold_ratio:
            baseline = values[i-1]  # 基准值为上一个正常的盐度值
            anomaly_start = i
            j = i
            
            # 向后查找，直到找到一个不小于基准值threshold_ratio的点
            # 或者直到时间区间超过max_days天
            anomaly_start_date = dates[anomaly_start]
            max_date = anomaly_start_date + np.timedelta64(int(max_days*24), 'h')
            
            while j < len(values) and values[j] < baseline * threshold_ratio and dates[j] <= max_date:
                j += 1
            
            anomaly_end = j - 1  # 异常区间的结束位置
            
            # 处理异常区间
            if anomaly_end - anomaly_start < 3:  # 短区间用线性插值
                if j < len(values):
                    # 如果异常区间后还有数据点，使用线性插值
                    for k in range(anomaly_start, anomaly_end + 1):
                        # 线性插值：在基准值和异常区间后第一个正常值之间进行平滑过渡
                        ratio = (k - anomaly_start + 1) / (anomaly_end - anomaly_start + 2)
                        values[k] = baseline * (1 - ratio) + values[j] * ratio
                        # 确保平滑后的值不低于基准的threshold_ratio
                        values[k] = max(values[k], baseline * threshold_ratio)
                else:
                    # 如果异常区间到数据末尾，使用基准值的threshold_ratio填充
                    for k in range(anomaly_start, anomaly_end + 1):
                        values[k] = baseline * threshold_ratio
            else:  # 长区间使用更简单的平滑方式，避免插值错误
                # 使用线性插值来避免非有限值问题
                if j < len(values):
                    end_val = values[j]
                    # 为每个点创建线性插值
                    for k in range(anomaly_start, anomaly_end + 1):
                        fraction = (k - anomaly_start) / (j - anomaly_start) if j > anomaly_start else 0
                        interpolated = baseline * (1 - fraction) + end_val * fraction
                        values[k] = max(interpolated, baseline * threshold_ratio)
                else:
                    # 如果异常区间到数据末尾，使用基准值的threshold_ratio填充
                    for k in range(anomaly_start, anomaly_end + 1):
                        values[k] = baseline * threshold_ratio
            
            i = j  # 跳过已处理的异常区间
        else:
            i += 1
    
    # 2. 应用Savitzky-Golay滤波进行整体平滑
    if len(values) > window_size:
        # 确保window_size是奇数
        if window_size % 2 == 0:
            window_size += 1
        
        # 应用Savitzky-Golay滤波
        try:
            # 对数据进行平滑，但保留原始的特性
            smoothed = savgol_filter(values, window_size, 3)
            
            # 确保平滑后的数据不会小于相邻点的threshold_ratio
            for i in range(1, len(smoothed)):
                smoothed[i] = max(smoothed[i], smoothed[i-1] * threshold_ratio)
            
            values = smoothed
        except Exception as e:
            print(f"Savitzky-Golay滤波应用失败: {e}")
    
    filtered_df['Value'] = values
    return filtered_df
 
# 读取CSV数据
df1 = pd.read_csv('yuce_data/一取水.csv')
df2 = pd.read_csv('yuce_data/青龙港1.csv')
df3 = pd.read_csv('yuce_data/太仓石化盐度2.csv')
 
# 将时间列转换为datetime格式
df1['DateTime'] = pd.to_datetime(df1['DateTime'])
df2['DateTime'] = pd.to_datetime(df2['DateTime'])
df3['DateTime'] = pd.to_datetime(df3['DateTime'])
 
# 应用盐度数据异常过滤方法
filtered_df1 = filter_salinity_anomalies(df1, threshold_ratio=0.5, window_size=7, max_days=1)
filtered_df2 = filter_salinity_anomalies(df2, threshold_ratio=0.5, window_size=7, max_days=1)
filtered_df3 = filter_salinity_anomalies(df3, threshold_ratio=0.5, window_size=7, max_days=1)
 
# 创建图表，调整为4个子图
plt.figure(figsize=(16, 16))
 
# 绘制一取水的原始数据和过滤后的数据
plt.subplot(4, 1, 1)
plt.plot(df1['DateTime'], df1['Value'], 'o-', color='lightblue', linewidth=1, markersize=2, alpha=0.6, label='一取水(原始)')
plt.plot(filtered_df1['DateTime'], filtered_df1['Value'], 'o-', color='blue', linewidth=1.5, markersize=3, label='一取水(过滤后)')
plt.title('一取水盐度监测数据异常过滤', fontsize=16)
plt.ylabel('盐度值', fontsize=14)
plt.legend(loc='best', fontsize=10)
plt.grid(True, linestyle='--', alpha=0.7)
 
# 绘制青龙港1的原始数据和过滤后的数据
plt.subplot(4, 1, 2)
plt.plot(df2['DateTime'], df2['Value'], 's-', color='lightcoral', linewidth=1, markersize=2, alpha=0.6, label='青龙港1(原始)')
plt.plot(filtered_df2['DateTime'], filtered_df2['Value'], 's-', color='red', linewidth=1.5, markersize=3, label='青龙港1(过滤后)')
plt.title('青龙港1盐度监测数据异常过滤', fontsize=16)
plt.ylabel('盐度值', fontsize=14)
plt.legend(loc='best', fontsize=10)
plt.grid(True, linestyle='--', alpha=0.7)
 
# 绘制太仓石化盐度1的原始数据和过滤后的数据
plt.subplot(4, 1, 3)
plt.plot(df3['DateTime'], df3['Value'], '^-', color='lightgreen', linewidth=1, markersize=2, alpha=0.6, label='太仓石化1(原始)')
plt.plot(filtered_df3['DateTime'], filtered_df3['Value'], '^-', color='green', linewidth=1.5, markersize=3, label='太仓石化1(过滤后)')
plt.title('太仓石化盐度1监测数据异常过滤', fontsize=16)
plt.ylabel('盐度值', fontsize=14)
plt.legend(loc='best', fontsize=10)
plt.grid(True, linestyle='--', alpha=0.7)
 
# 计算过滤前后的差异
df1['Difference'] = filtered_df1['Value'] - df1['Value']
df2['Difference'] = filtered_df2['Value'] - df2['Value']
df3['Difference'] = filtered_df3['Value'] - df3['Value']
 
# 绘制差异
plt.subplot(4, 1, 4)
plt.plot(df1['DateTime'], df1['Difference'], 'o-', color='blue', linewidth=1.5, markersize=3, label='一取水(差异)')
plt.plot(df2['DateTime'], df2['Difference'], 's-', color='red', linewidth=1.5, markersize=3, label='青龙港1(差异)')
plt.plot(df3['DateTime'], df3['Difference'], '^-', color='green', linewidth=1.5, markersize=3, label='太仓石化1(差异)')
plt.title('数据处理前后差异', fontsize=16)
plt.xlabel('时间', fontsize=14)
plt.ylabel('差异值', fontsize=14)
plt.legend(loc='best', fontsize=10)
plt.grid(True, linestyle='--', alpha=0.7)
 
# 格式化x轴日期显示
for ax in plt.gcf().get_axes():
    ax.xaxis_date()
plt.gcf().autofmt_xdate()
 
# 显示图表
plt.tight_layout()
plt.show()
 
# 保存过滤后的数据
filtered_df1.to_csv('yuce_data/一取水_filtered.csv', index=False)
filtered_df2.to_csv('yuce_data/青龙港1_filtered.csv', index=False)
filtered_df3.to_csv('yuce_data/太仓石化盐度1_filtered.csv', index=False)
 
# 打印过滤前后的统计信息及处理区间
print("\n数据过滤前后统计信息:")
print("\n一取水:")
print(f"原始数据  - 最小值: {df1['Value'].min():.2f}, 最大值: {df1['Value'].max():.2f}, 均值: {df1['Value'].mean():.2f}, 标准差: {df1['Value'].std():.2f}")
print(f"过滤后数据 - 最小值: {filtered_df1['Value'].min():.2f}, 最大值: {filtered_df1['Value'].max():.2f}, 均值: {filtered_df1['Value'].mean():.2f}, 标准差: {filtered_df1['Value'].std():.2f}")
 
print("\n青龙港1:")
print(f"原始数据  - 最小值: {df2['Value'].min():.2f}, 最大值: {df2['Value'].max():.2f}, 均值: {df2['Value'].mean():.2f}, 标准差: {df2['Value'].std():.2f}")
print(f"过滤后数据 - 最小值: {filtered_df2['Value'].min():.2f}, 最大值: {filtered_df2['Value'].max():.2f}, 均值: {filtered_df2['Value'].mean():.2f}, 标准差: {filtered_df2['Value'].std():.2f}")
 
print("\n太仓石化盐度1:")
print(f"原始数据  - 最小值: {df3['Value'].min():.2f}, 最大值: {df3['Value'].max():.2f}, 均值: {df3['Value'].mean():.2f}, 标准差: {df3['Value'].std():.2f}")
print(f"过滤后数据 - 最小值: {filtered_df3['Value'].min():.2f}, 最大值: {filtered_df3['Value'].max():.2f}, 均值: {filtered_df3['Value'].mean():.2f}, 标准差: {filtered_df3['Value'].std():.2f}")
 
# 找出异常点，计算处理的区间数量和时间跨度
def get_anomaly_stats(df_orig, df_filtered):
    diff = abs(df_filtered['Value'] - df_orig['Value'])
    anomalies = df_orig[diff > 0.01]  # 差异大于0.01的点认为是异常点
    
    if len(anomalies) == 0:
        return 0, "无异常区间"
    
    # 计算连续的异常区间
    anomaly_indices = anomalies.index.tolist()
    intervals = []
    start_idx = anomaly_indices[0]
    prev_idx = start_idx
    
    for idx in anomaly_indices[1:]:
        if idx > prev_idx + 1:  # 不连续
            intervals.append((start_idx, prev_idx))
            start_idx = idx
        prev_idx = idx
    
    intervals.append((start_idx, prev_idx))  # 添加最后一个区间
    
    # 计算区间的时间跨度
    time_spans = []
    for start, end in intervals:
        if start == end:
            span = "单点"
        else:
            delta = df_orig.iloc[end]['DateTime'] - df_orig.iloc[start]['DateTime']
            hours = delta.total_seconds() / 3600
            if hours < 1:
                span = f"{int(hours * 60)}分钟"
            elif hours < 24:
                span = f"{int(hours)}小时"
            else:
                span = f"{int(hours/24)}天{int(hours%24)}小时"
        time_spans.append(span)
    
    return len(intervals), time_spans
 
anomaly_count1, time_spans1 = get_anomaly_stats(df1, filtered_df1)
anomaly_count2, time_spans2 = get_anomaly_stats(df2, filtered_df2)
anomaly_count3, time_spans3 = get_anomaly_stats(df3, filtered_df3)
 
print("\n异常区间统计:")
print(f"一取水 - 异常区间数量: {anomaly_count1}")
if anomaly_count1 > 0:
    print(f"区间时间跨度: {time_spans1}")
 
print(f"青龙港1 - 异常区间数量: {anomaly_count2}")
if anomaly_count2 > 0:
    print(f"区间时间跨度: {time_spans2}")
 
print(f"太仓石化盐度1 - 异常区间数量: {anomaly_count3}")
if anomaly_count3 > 0:
    print(f"区间时间跨度: {time_spans3}")