Multivariate Time Series Anomaly Detection with Convolutional Neural Networks (Python)

Published: 2024-08-18
import warnings
warnings.filterwarnings('ignore')


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline


import seaborn as sns


from scipy.stats import multivariate_normal
from sklearn.metrics import f1_score, confusion_matrix, classification_report, precision_recall_fscore_support
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split


from keras import models, layers, optimizers
import keras.backend as K






import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

Data

# Read data


dataRaw = pd.read_csv('/kaggle/input/time-series/TimeSeries.csv')
RawTS = dataRaw.copy()
print(RawTS.shape)
RawTS.head()
(509632, 11)

labels = pd.read_csv('/kaggle/input/time-series/labelsTimeSeries.csv')
print(labels.shape)
labels.head()
(509632, 1)
labels.label.value_counts(), labels.label.value_counts(normalize=True)
(0    509189
 1       443
 Name: label, dtype: int64,
 0    0.999131
 1    0.000869
 Name: label, dtype: float64)
RawTS.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 509632 entries, 0 to 509631
Data columns (total 11 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   v1      509632 non-null  float64
 1   v2      509632 non-null  float64
 2   v3      509632 non-null  float64
 3   v4      509632 non-null  float64
 4   v5      509632 non-null  float64
 5   v6      509632 non-null  float64
 6   v7      509632 non-null  float64
 7   v8      509632 non-null  float64
 8   v9      509632 non-null  float64
 9   v10     509632 non-null  float64
 10  v11     509632 non-null  float64
dtypes: float64(11)
memory usage: 42.8 MB
RawTS.describe()

# NORMALIZE data


x = RawTS.values  # returns a numpy array
#scaler = MinMaxScaler()  # alternative scaler
scaler = StandardScaler()
x_scaled = scaler.fit_transform(x)
dfNorm = pd.DataFrame(x_scaled, columns=RawTS.columns)


print(dfNorm.shape)
dfNorm.head()
(509632, 11)

plt.style.use('seaborn-whitegrid')
fig = plt.figure()
ax = plt.axes()


ax.plot(dfNorm['v1'])
ax.plot(labels['label']/10)

for col in dfNorm.columns:
    plt.style.use('seaborn-whitegrid')
    fig = plt.figure()
    ax = plt.axes()
    plt.title(col)


    ax.plot(dfNorm[col])
    ax.plot(labels['label']/10)

Note that ONE event actually spans a short TIME SERIES of readings around the labeled row

# Some viz of normalized data ONE EVENT


eventRow = 10702


minX = eventRow - 10
maxX = eventRow + 10


for col in dfNorm.columns:
    plt.style.use('seaborn-whitegrid')
    fig = plt.figure()
    ax = plt.axes()
    plt.title(col)

    ax.plot(dfNorm[col][minX:maxX])
    ax.plot(labels['label'][minX:maxX])

It seems that v7-v11 don't change much during ONE event... let's check the stats on no-event vs. event rows

dfNorm['label'] = labels['label']
dfNorm.shape
(509632, 12)
NoEvent = dfNorm[dfNorm['label'] == 0]
OnlyEvent = dfNorm[dfNorm['label'] == 1]
print(NoEvent.shape)
print(OnlyEvent.shape)
(509189, 12)
(443, 12)
OnlyEvent.describe()

# Viz the distribution of each feature: no-event vs event




for col in dfNorm.columns[:-1]:   # skip the label column

    plt.figure()

    sns.distplot(OnlyEvent[col]).set_title(col)

    sns.distplot(NoEvent[col])

# The distribution differences above between event and no-event may just reflect the sample-size difference...
# Repeat the plots with a no-event sample of the same size ... downsample


NoEventSample = NoEvent.sample(443)
NoEventSample.shape
(443, 12)
# Viz the distribution of each feature: no event vs event - SAME size


for col in dfNorm.columns[:-1]:   # skip the label column
    plt.figure()
    sns.distplot(OnlyEvent[col], color="red").set_title(col)
    sns.distplot(NoEventSample[col])

Multi-variate Gaussian probability density function - as the baseline

Split into train and test

print(NoEvent.shape)
print(OnlyEvent.shape)


Cols = list(NoEvent)[:-1] 
NoEvent = NoEvent[Cols]
OnlyEvent = OnlyEvent[Cols]
print(NoEvent.shape)
print(OnlyEvent.shape)
(509189, 12)
(443, 12)
(509189, 11)
(443, 11)
# CREATE the TRAIN and TEST sets
# Anom data is ONLY in TEST - not in TRAIN


num_test = 100000
shuffled_data = NoEvent.sample(frac=1, random_state=47).values  # shuffle all no-event rows
X_train = shuffled_data[:-num_test]                             # training: no-event rows only


X_test = np.concatenate([shuffled_data[-num_test:], OnlyEvent[:]])
y_test = np.concatenate([np.zeros(num_test), np.ones(OnlyEvent.shape[0])])


print("normal ", dfNorm.shape)
print("OnlyEvents", OnlyEvent.shape)
print("OnlyEvents data only in Test with NONE in the training")
print("X_train ", X_train.shape)
print("X_test ", X_test.shape)
print("y_test ", y_test.shape)
normal  (509632, 12)
OnlyEvents (443, 11)
OnlyEvents data only in Test with NONE in the training
X_train  (409189, 11)
X_test  (100443, 11)
y_test  (100443,)
p = multivariate_normal(mean=np.mean(X_train,axis=0), cov=np.cov(X_train.T))


x = p.pdf(X_train) 
probNorm = x / x.sum() # Normalize pdf
print("max prob of x on X_train", max(probNorm))
print("mean prob of x on X_train", np.mean(probNorm))
print('-' * 60)
MyTrain = np.mean(probNorm)


x = p.pdf(X_test) 
probNorm = x / x.sum() # Normalize pdf
print("max prob of x on X_test", max(probNorm))
print("mean prob of x on X_test", np.mean(probNorm))
print('-' * 60)
MyTest = np.mean(probNorm)


x = p.pdf(OnlyEvent) 
probNorm = x / x.sum() # Normalize pdf
print("max prob of x on OnlyEvent", max(probNorm))
print("mean prob of x on OnlyEvent", np.mean(probNorm))
print('-' * 60)


print('Difference between mean prob of Train vs Test ', MyTrain - MyTest)
max prob of x on X_train 7.542779528167279e-06
mean prob of x on X_train 2.443858461493344e-06
------------------------------------------------------------
max prob of x on X_test 3.087037029140375e-05
mean prob of x on X_test 9.95589538345131e-06
------------------------------------------------------------
max prob of x on OnlyEvent 0.7707216579692563
mean prob of x on OnlyEvent 0.002257336343115125
------------------------------------------------------------
Difference between mean prob of Train vs Test  -7.512036921957967e-06
# Find best epsilon re F1 score


x = p.pdf(X_test)
x = x / x.sum() # Normalize pdf


EpsF1 = []




epsilons = [1e-5, 1e-10, 1e-15, 1e-20, 1e-25, 1e-30,1e-35, 1e-40, 1e-45, 1e-50, 1e-55, 1e-60, 1e-65, 1e-70, 1e-75, 1e-80, 1e-85, 1e-90]




for eps in epsilons:
    pred = (x <= eps)
    f = f1_score(y_test, pred, average='binary')
    #print("F1 score on test", round(f,4), " with epsilon ", eps)
    EpsF1.append([eps, round(f,4)])
    
EpsF1df = pd.DataFrame(EpsF1, columns = ['epsilon', 'F1'])
EpsF1df.head(20)

EpsF1df.plot.line("epsilon","F1")
plt.xscale('log')
plt.xlim(1e-5, 1e-95)
plt.title("F1 vs decreasing log Epsilon")
plt.show()
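The best threshold can also be read straight off the sweep table; a small sketch using the EpsF1df built above:

# Pick the epsilon with the highest F1 from the sweep
best_row = EpsF1df.loc[EpsF1df['F1'].idxmax()]
print("best epsilon:", best_row['epsilon'], " F1:", best_row['F1'])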

Time series approach

  • Create periods of 9 time steps, each labeled as NoEvent or OnlyEvent
  • Each event sits in the middle of its period (4 rows before and 4 after), and that chunk is labeled OnlyEvent
  • No-event periods contain no event anywhere within their 9 time steps and are labeled NoEvent
  • The hope is that the model can then learn to separate event periods from no-event periods
# Identify the row where event
EventsRows = dfNorm.index[dfNorm['label'] == 1]
len(EventsRows)
443
# Returns minimum difference between any pair 
def findMinDiff(arr, n): 
    # Initialize difference as infinite 
    diff = 10**20
      
    # Find the min diff by comparing difference 
    # of all possible pairs in given array 
    for i in range(n-1): 
        for j in range(i+1,n): 
            if abs(arr[i]-arr[j]) < diff: 
                diff = abs(arr[i] - arr[j]) 
  
    # Return min diff 
    return diff
n = len(EventsRows) 
print("Minimum difference between events is " + str(findMinDiff(EventsRows, n)),'time steps')
Minimum difference between events is 15 time steps
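Since EventsRows is an ordered index, the same minimum gap can be obtained without the O(n^2) loop; an equivalent sketch:

# Equivalent: sort the event indices and take the smallest gap between neighbours
gaps = np.diff(np.sort(np.asarray(EventsRows)))
print("Minimum difference between events is", gaps.min(), "time steps")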

Chunk the NoEvent and OnlyEvent rows into buckets ... chunk size = 9 time steps

def chunk(seq, size):
    return (seq[pos:pos + size] for pos in range(0, len(seq), size))
ChunkCounter = 0
result_arrList = []
chunkSize = 9


for df_chunk in chunk(dfNorm, chunkSize):
    if df_chunk.label.any() == 1 or df_chunk.shape[0] < chunkSize:
        pass
    else:
        result_arrList.append(df_chunk.iloc[:,:-1])
        ChunkCounter = ChunkCounter + 1
        #print(df_chunk)
    
ChunkCounter
56182
# No-event 56k periods - chunks of 9 time steps and 11 features each:


NoEvent_arr = np.stack(result_arrList, axis=0)
NoEvent_arr.shape
(56182, 9, 11)
# For the OnlyEvent chunking


idx = dfNorm.index.get_indexer_for(dfNorm[dfNorm.label == 1].index)
print(idx.shape)


n=4


OnlyEvent_arr = dfNorm.iloc[np.unique(np.concatenate([np.arange(max(i-n,0), min(i+n+1, len(dfNorm)))
                                            for i in idx]))]


pd.options.display.max_rows = 100
OnlyEvent_arr.head(100)
# After checking the event label is in the middle of each OnlyEvent period, remove the label and make an array of chunks


OnlyEvent_arr = OnlyEvent_arr.iloc[:,:-1]
OnlyEvent_arr.head()

ChunkCounter = 0
result_arrList = []
chunkSize = 9


for df_chunk in chunk(OnlyEvent_arr, chunkSize):
    if df_chunk.shape[0] < chunkSize:
        pass
    else:
        result_arrList.append(df_chunk)
        ChunkCounter = ChunkCounter + 1
        #print(df_chunk)
    
ChunkCounter
443
# OnlyEvent 443 periods - chunks of 9 time steps and 11 features each:


OnlyEvent_arr = np.stack(result_arrList, axis=0)
OnlyEvent_arr.shape
(443, 9, 11)

Now the anomaly rate is 443 / 56,625 ≈ 0.78% of chunks ... much easier, roughly 9 times higher than the raw per-row rate of 0.087%...
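A quick sanity check of that claim, using the counts printed by the chunking above:

# Event rate before vs after windowing
raw_rate = 443 / 509632              # ~0.00087 per raw time step
chunk_rate = 443 / (56182 + 443)     # ~0.0078 per 9-step chunk
print(raw_rate, chunk_rate, chunk_rate / raw_rate)   # ratio is roughly 9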

# Concatenate the 2 arrays before split into train and test


AllFeat_arr = np.append(NoEvent_arr, OnlyEvent_arr, axis = 0)
AllFeat_arr.shape
(56625, 9, 11)
# Create the label ... y


y0 = np.zeros(NoEvent_arr.shape[0])


y1 = np.ones(OnlyEvent_arr.shape[0])


y = np.append(y0, y1, axis = 0)
y.shape
(56625,)

56k samples, each has 9 time steps and 11 features (56k tables of 9 rows X 11 cols)

# Split into train and test


X_train, X_test, y_train, y_test = train_test_split(AllFeat_arr, y, test_size=0.2, random_state=7)
print ('X_train: ', X_train.shape)
print ('X_test: ', X_test.shape)
print ('y_train: ', y_train.shape)
print ('y_test: ', y_test.shape)
X_train:  (45300, 9, 11)
X_test:  (11325, 9, 11)
y_train:  (45300,)
y_test:  (11325,)
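Because events are still rare (~0.78% of chunks), one optional variant is a stratified split so train and test keep the same event rate; a sketch, not used for the results below:

# Stratified alternative to the plain random split
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(
    AllFeat_arr, y, test_size=0.2, random_state=7, stratify=y)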
# Reshape the arrays to add a trailing channel dimension so Conv2D can consume them


data_train_wide = X_train.reshape((X_train.shape[0], X_train.shape[1], X_train.shape[2], 1))
data_test_wide = X_test.reshape((X_test.shape[0], X_test.shape[1], X_test.shape[2], 1))


print(data_train_wide.shape)
print(data_test_wide.shape)
(45300, 9, 11, 1)
(11325, 9, 11, 1)

Conv2D NN

Baseline accuracy by majority voting = 100 - 0.79% = 99.21%
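A quick check of that baseline from the chunk counts above (it comes out to about 99.2%):

# Majority-vote (always predict NoEvent) accuracy on the chunked data
baseline_acc = NoEvent_arr.shape[0] / (NoEvent_arr.shape[0] + OnlyEvent_arr.shape[0])
print(round(baseline_acc, 4))   # ~0.992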

import keras.backend as K


def get_f1(y_true, y_pred): #taken from old keras source code
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    recall = true_positives / (possible_positives + K.epsilon())
    f1_val = 2*(precision*recall)/(precision+recall+K.epsilon())
    return f1_val
# NN model


n_filters = 64
fsize = 5  # Passed as a single int, so the kernel is square: (fsize, fsize)
window_size = 9   # Number of time steps in one period
n_features = 11 # Number of cols in one sample (one table)




MyModel = models.Sequential()
MyModel.add(layers.Conv2D(n_filters, fsize, activation='relu', input_shape=(window_size, n_features, 1)))
MyModel.add(layers.Flatten())
MyModel.add(layers.Dense(256, activation='relu'))
#MyModel.add(layers.Dropout(0.2))


MyModel.add(layers.Dense(1, activation='sigmoid'))


MyModel.compile(optimizer=optimizers.Adam(lr=1e-4), 
              loss='binary_crossentropy', 
              metrics=[get_f1])
              #metrics=['binary_accuracy'])


print(MyModel.summary())
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
conv2d (Conv2D)              (None, 5, 7, 64)          1664      
_________________________________________________________________
flatten (Flatten)            (None, 2240)              0         
_________________________________________________________________
dense (Dense)                (None, 256)               573696    
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 257       
=================================================================
Total params: 575,617
Trainable params: 575,617
Non-trainable params: 0
_________________________________________________________________
None
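The parameter counts in the summary can be reproduced by hand, using the 5x5 kernel, 1 input channel and 64 filters defined above:

# Conv2D: (kernel_h * kernel_w * in_channels + bias) * filters
conv_params = (5 * 5 * 1 + 1) * 64                 # 1,664
flat_units = (9 - 5 + 1) * (11 - 5 + 1) * 64       # 5 x 7 x 64 = 2,240
dense_params = (flat_units + 1) * 256              # 573,696
out_params = (256 + 1) * 1                         # 257
print(conv_params + dense_params + out_params)     # 575,617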
# Train / fit


history = MyModel.fit(data_train_wide, y_train, 
                      validation_split=0.2, 
                      epochs = 20, 
                      batch_size = 16)
#Learning curves ... F1


acc = history.history['get_f1'] 
val_acc = history.history['val_get_f1'] 
loss = history.history['loss'] 
val_loss = history.history['val_loss'] 
epochs = range(1, len(acc) + 1) 
plt.plot(epochs, acc, 'bo', label='Training F1') 
plt.plot(epochs, val_acc, 'b', label='Validation F1') 
plt.title('Training and validation F1') 
plt.legend() 
plt.figure() 
plt.plot(epochs, loss, 'bo', label='Training loss') 
plt.plot(epochs, val_loss, 'b', label='Validation loss') 
plt.title('Training and validation loss') 
plt.legend() 
plt.show()

# Final Predict
# NOTE final_predictions is a list of probabilities


final_predictions = MyModel.predict(data_test_wide)
final_predictions.shape
(11325, 1)
# Modify the raw final_predictions - prediction probs into 0 and 1


Preds = final_predictions.copy()
#print(len(Preds))
#print(Preds)
Preds[ np.where( Preds >= 0.5 ) ] = 1
Preds[ np.where( Preds < 0.5 ) ] = 0


Preds.shape
(11325, 1)
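The same thresholding can be written as a single vectorized line, equivalent to the copy-and-assign above:

Preds = (final_predictions >= 0.5).astype(int)   # same 0/1 labels as above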
# Confusion matrix


from sklearn import metrics
conf_mx = metrics.confusion_matrix(y_test, Preds)


TN = conf_mx[0,0]
FP = conf_mx[0,1]
FN = conf_mx[1,0]
TP = conf_mx[1,1]


print ('TN: ', TN)
print ('FP: ', FP)
print ('FN: ', FN)
print ('TP: ', TP)


recall = TP/(TP+FN)
precision = TP/(TP+FP)


print (recall, precision)
TN:  11242
FP:  0
FN:  2
TP:  81
0.9759036144578314 1.0
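Cross-checking F1 against the precision and recall computed above:

# F1 from the confusion-matrix precision and recall
f1_manual = 2 * precision * recall / (precision + recall)
print(round(f1_manual, 4))   # ~0.9878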
def plot_confusion_matrix(cm,target_names,title='Confusion matrix',cmap=None,
                          normalize=False):
    import itertools
    accuracy = np.trace(cm) / float(np.sum(cm))
    misclass = 1 - accuracy


    if cmap is None:
        cmap = plt.get_cmap('Blues')


    plt.figure(figsize=(8, 6))
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()


    if target_names is not None:
        tick_marks = np.arange(len(target_names))
        plt.xticks(tick_marks, target_names, rotation=45)
        plt.yticks(tick_marks, target_names)
        
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        
    thresh = cm.max() / 1.5 if normalize else cm.max() / 2
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        if normalize:
            plt.text(j, i, "{:0.4f}".format(cm[i, j]),
                     horizontalalignment="center",
                     color="white" if cm[i, j] > thresh else "black")
        else:
            plt.text(j, i, "{:,}".format(cm[i, j]),
                     horizontalalignment="center",
                     color="white" if cm[i, j] > thresh else "black")




    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label\naccuracy={:0.4f}; misclass={:0.4f}'.format(accuracy, misclass))
    plt.show()
plot_confusion_matrix(conf_mx, 
                      normalize    = False,
                      target_names = ['NoEvent', 'Event'],
                      title        = "Confusion Matrix on test")

from sklearn.metrics import precision_recall_curve, average_precision_score, auc
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score


# calculate precision-recall curve
precision, recall, thresholds = precision_recall_curve(y_test, Preds)
# calculate F1 score
f1 = metrics.f1_score(y_test, Preds)
print('f1=%.3f' % (f1))
# plot the no-skill baseline (precision = share of positive samples in y_test)
no_skill = y_test.sum() / len(y_test)
plt.plot([0, 1], [no_skill, no_skill], linestyle='--')
# plot the precision-recall curve for the model
plt.plot(recall, precision, marker='.')
# show the plot
plt.show()
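Note that precision_recall_curve is normally fed the raw probabilities rather than the thresholded 0/1 labels, so the curve sweeps over all thresholds instead of just 0.5; a small sketch using final_predictions from above:

# PR curve from probabilities instead of hard predictions
prec, rec, thr = precision_recall_curve(y_test, final_predictions.ravel())
print("Average precision:", average_precision_score(y_test, final_predictions.ravel()))
plt.plot(rec, prec, marker='.')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.show()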

Results

Adam(lr=1e-4), batch size = 16, monitored on get_f1, square kernel size = (fsize, fsize), 20 epochs

  • Conv2D 64 filters, Flatten, Dense 256, Sigmoid, fsize=2 ... F1 = 0.975
  • Conv2D 64 filters, Flatten, Dense 256, Sigmoid, fsize=3 ... F1 = 0.982
  • Conv2D 64 filters, Flatten, Dense 256, Sigmoid, fsize=4 ... F1 = 0.988
  • Conv2D 64 filters, Flatten, Dense 256, Sigmoid, fsize=5 ... F1 = 0.988
  • Conv2D 64 filters, Flatten, Dense 256, Sigmoid, fsize=6 ... F1 = 0.988
  • Conv2D 64 filters, Flatten, Dense 256, Dropout 0.2, Sigmoid, fsize=5 ... F1 = 0.976
  • Conv2D 64 filters, Flatten, Dense 256, Dropout 0.2, Sigmoid, rectangular kernel size = (1, 5) ... F1 = 0.982
  • Conv2D 64 filters, Flatten, Dense 256, Sigmoid, 20 epochs, optimized on get_f1 ... F1 = 0.982
  • Conv2D 64 filters, Flatten, Dense 256, Sigmoid, 20 epochs, optimized on binary_acc ... F1 = 0.956
  • Conv2D 64 filters, Flatten, Sigmoid, 50 epochs ... F1 = 0.91

