import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
from numpy import ma
import math
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
from matplotlib import ticker, cm
import matplotlib.gridspec as gridspec
import matplotlib.colors as colors
%matplotlib inline
import seaborn as sns
from scipy.stats import multivariate_normal
from sklearn.metrics import f1_score, confusion_matrix, classification_report, precision_recall_fscore_support
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
from keras import layers
from keras.models import Sequential
from keras.layers import Flatten, Dense
from keras.layers import Embedding
from keras.utils import np_utils, to_categorical
from keras.datasets import imdb
#from keras import preprocessing
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras import models, regularizers, layers, optimizers, losses, metrics
from keras.optimizers import Adam
from keras.callbacks import Callback,ModelCheckpoint
from keras.models import Sequential,load_model
from keras.layers import Dense, Dropout, GlobalAveragePooling1D
from keras.wrappers.scikit_learn import KerasClassifier
import keras.backend as K
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
Data
# Read data
dataRaw = pd.read_csv('/kaggle/input/time-series/TimeSeries.csv')
RawTS = dataRaw.copy()
print(RawTS.shape)
RawTS.head()
(509632, 11)
labels = pd.read_csv('/kaggle/input/time-series/labelsTimeSeries.csv')
print(labels.shape)
labels.head()
(509632, 1)
labels.label.value_counts(), labels.label.value_counts(normalize=True)
(0 509189
1 443
Name: label, dtype: int64,
0 0.999131
1 0.000869
Name: label, dtype: float64)
RawTS.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 509632 entries, 0 to 509631
Data columns (total 11 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 v1 509632 non-null float64
1 v2 509632 non-null float64
2 v3 509632 non-null float64
3 v4 509632 non-null float64
4 v5 509632 non-null float64
5 v6 509632 non-null float64
6 v7 509632 non-null float64
7 v8 509632 non-null float64
8 v9 509632 non-null float64
9 v10 509632 non-null float64
10 v11 509632 non-null float64
dtypes: float64(11)
memory usage: 42.8 MB
RawTS.describe()
# NORMALIZE data
x = RawTS.values #returns a numpy array
#min_max_scaler = preprocessing.MinMaxScaler()
scaler = StandardScaler()
x_scaled = scaler.fit_transform(x)
dfNorm = pd.DataFrame(x_scaled, columns=RawTS.columns)
print(dfNorm.shape)
dfNorm.head()
(509632, 11)
plt.style.use('seaborn-whitegrid')
fig = plt.figure()
ax = plt.axes()
ax.plot(dfNorm['v1'])
ax.plot(labels['label']/10)
for col in dfNorm.columns:
    plt.style.use('seaborn-whitegrid')
    fig = plt.figure()
    ax = plt.axes()
    plt.title(col)
    ax.plot(dfNorm[col])
    ax.plot(labels['label']/10)
Note that ONE labeled event is actually a TIME SERIES: the rows around the labeled row are part of the same event
# Some viz of normalized data ONE EVENT
eventRow = 10702
minX = eventRow - 10
maxX = eventRow + 10
for col in dfNorm.columns:
    plt.style.use('seaborn-whitegrid')
    fig = plt.figure()
    ax = plt.axes()
    plt.title(col)
    ax.plot(dfNorm[col][minX:maxX])
    ax.plot(labels['label'][minX:maxX])
It seems that v7-v11 don't change much during ONE event... let's check the stats on no-event vs. event rows
dfNorm['label'] = labels['label']
dfNorm.shape
(509632, 12)
NoEvent = dfNorm[dfNorm['label'] == 0]
OnlyEvent = dfNorm[dfNorm['label'] == 1]
print(NoEvent.shape)
print(OnlyEvent.shape)
(509189, 12)
(443, 12)
OnlyEvent.describe()
# Viz the distribution of each feature: no-event vs event
for col in dfNorm.columns:
    plt.figure()
    sns.distplot(OnlyEvent[col]).set_title(col)
    sns.distplot(NoEvent[col])
# The distribution differences above between event and no-event may be due to the very different sample sizes...
# Repeat the comparison with a no-event sample of the same size (downsample)
NoEventSample = NoEvent.sample(443)
NoEventSample.shape
(443, 12)
# Viz the distribution of each feature: no event vs event - SAME size
for col in dfNorm.columns:
    plt.figure()
    sns.distplot(OnlyEvent[col], fit_kws={"color": "red"}).set_title(col)
    sns.distplot(NoEventSample[col])
Multivariate Gaussian probability density function as the baseline
Split into train and test
print(NoEvent.shape)
print(OnlyEvent.shape)
Cols = list(NoEvent)[:-1]
NoEvent = NoEvent[Cols]
OnlyEvent = OnlyEvent[Cols]
print(NoEvent.shape)
print(OnlyEvent.shape)
(509189, 12)
(443, 12)
(509189, 11)
(443, 11)
# CREATE the TRAIN and TEST sets
# Anomalous data is ONLY in TEST - not in TRAIN
num_test = 100000
# Shuffle all no-event rows, then hold out the last num_test rows for the test set
shuffled_data = NoEvent.sample(frac=1, random_state=47).values
X_train = shuffled_data[:-num_test]
#X_valid = np.concatenate([shuffled_data[-2*num_test:-num_test], fraud_pca_data[:246]])
#y_valid = np.concatenate([np.zeros(num_test), np.ones(246)])
X_test = np.concatenate([shuffled_data[-num_test:], OnlyEvent[:]])
y_test = np.concatenate([np.zeros(num_test), np.ones(OnlyEvent.shape[0])])
print("normal ", dfNorm.shape)
print("OnlyEvents", OnlyEvent.shape)
print("OnlyEvents data only in Test with NONE in the training")
print("X_train ", X_train.shape)
print("X_test ", X_test.shape)
print("y_test ", y_test.shape)
normal (509632, 12)
OnlyEvents (443, 11)
OnlyEvents data only in Test with NONE in the training
X_train (409189, 11)
X_test (100443, 11)
y_test (100443,)
p = multivariate_normal(mean=np.mean(X_train,axis=0), cov=np.cov(X_train.T))
x = p.pdf(X_train)
probNorm = x / x.sum() # Normalize pdf
print("max prob of x on X_train", max(probNorm))
print("mean prob of x on X_train", np.mean(probNorm))
print('-' * 60)
MyTrain = np.mean(probNorm)
x = p.pdf(X_test)
probNorm = x / x.sum() # Normalize pdf
print("max prob of x on X_test", max(probNorm))
print("mean prob of x on X_test", np.mean(probNorm))
print('-' * 60)
MyTest = np.mean(probNorm)
x = p.pdf(OnlyEvent)
probNorm = x / x.sum() # Normalize pdf
print("max prob of x on OnlyEvent", max(probNorm))
print("mean prob of x on OnlyEvent", np.mean(probNorm))
print('-' * 60)
print('Difference between mean prob of Train vs Test ', MyTrain - MyTest)
max prob of x on X_train 7.542779528167279e-06
mean prob of x on X_train 2.443858461493344e-06
------------------------------------------------------------
max prob of x on X_test 3.087037029140375e-05
mean prob of x on X_test 9.95589538345131e-06
------------------------------------------------------------
max prob of x on OnlyEvent 0.7707216579692563
mean prob of x on OnlyEvent 0.002257336343115125
------------------------------------------------------------
Difference between mean prob of Train vs Test -7.512036921957967e-06
# Find best epsilon re F1 score
x = p.pdf(X_test)
x = x / x.sum() # Normalize pdf
EpsF1 = []
epsilons = [1e-5, 1e-10, 1e-15, 1e-20, 1e-25, 1e-30,1e-35, 1e-40, 1e-45, 1e-50, 1e-55, 1e-60, 1e-65, 1e-70, 1e-75, 1e-80, 1e-85, 1e-90]
for eps in epsilons:
    pred = (x <= eps)
    f = f1_score(y_test, pred, average='binary')
    #print("F1 score on test", round(f, 4), " with epsilon ", eps)
    EpsF1.append([eps, round(f, 4)])
EpsF1df = pd.DataFrame(EpsF1, columns = ['epsilon', 'F1'])
EpsF1df.head(20)
EpsF1df.plot.line("epsilon","F1")
plt.xscale('log')
plt.xlim(1e-5, 1e-95)
plt.title("F1 vs decreasing log Epsilon")
plt.show()
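As a quick follow-up (a minimal sketch, assuming the x, y_test and EpsF1df variables defined above), one can pick the epsilon with the highest F1 and inspect the confusion matrix at that threshold:
# Sketch: select the best epsilon from the sweep above and report its confusion matrix
best_row = EpsF1df.loc[EpsF1df['F1'].idxmax()]
best_eps = best_row['epsilon']
best_pred = (x <= best_eps)  # flag rows whose normalized density falls below epsilon
print("best epsilon:", best_eps, "F1:", best_row['F1'])
print(confusion_matrix(y_test, best_pred))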
Time series approach
- Create periods of 9 time steps labeled as NoEvent or OnlyEvent
- Event periods have the labeled event in the middle (4 rows before and 4 after), and the chunk is labeled OnlyEvent
- No-event periods have no event anywhere within their 9 time steps and are labeled NoEvent
- The hope is that the algorithm can then separate the events (a minimal sketch of the windowing follows; the full implementation is further below)
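A minimal sketch of the intended event windowing (an illustration only, assuming dfNorm with its 'label' column from above and an event that is not within 4 rows of the start or end of the data):
# Sketch: for an event at row i, take rows i-4 .. i+4 as one 9-step window (label column dropped)
half = 4
event_rows = np.flatnonzero(dfNorm['label'].values == 1)
i = event_rows[0]
example_window = dfNorm.iloc[i - half : i + half + 1, :-1].values
print(example_window.shape)  # expected (9, 11): 9 time steps x 11 features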
# Identify the row where event
EventsRows = dfNorm.index[dfNorm['label'] == 1]
len(EventsRows)
443
# Returns minimum difference between any pair
def findMinDiff(arr, n):
    # Initialize difference as infinite
    diff = 10**20
    # Find the min diff by comparing difference
    # of all possible pairs in given array
    for i in range(n-1):
        for j in range(i+1, n):
            if abs(arr[i]-arr[j]) < diff:
                diff = abs(arr[i] - arr[j])
    # Return min diff
    return diff
n = len(EventsRows)
print("Minimum difference between events is " + str(findMinDiff(EventsRows, n)), 'time steps')
Minimum difference between events is 15 time steps
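The same check can be done without the O(n^2) loop; an equivalent one-liner (a sketch, assuming EventsRows is the sorted index of event rows from above) uses the fact that for sorted values the minimum pairwise difference is the minimum gap between consecutive entries:
print(np.diff(np.sort(np.asarray(EventsRows))).min())  # expected 15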
Chunk the NoEvent and OnlyEvent data into buckets of 9 time steps each. Since the minimum gap between events is 15 time steps, a 9-step window can contain at most one event.
def chunk(seq, size):
    return (seq[pos:pos + size] for pos in range(0, len(seq), size))
ChunkCounter = 0
result_arrList = []
chunkSize = 9
for df_chunk in chunk(dfNorm, chunkSize):
    if df_chunk.label.any() or df_chunk.shape[0] < chunkSize:
        pass
    else:
        result_arrList.append(df_chunk.iloc[:, :-1])
        ChunkCounter = ChunkCounter + 1
        #print(df_chunk)
ChunkCounter
56182
# No-event 56k periods - chunks of 9 time steps and 11 features each:
NoEvent_arr = np.stack(result_arrList, axis=0)
NoEvent_arr.shape
(56182, 9, 11)
# For the OnlyEvent chunking
idx = dfNorm.index.get_indexer_for(dfNorm[dfNorm.label == 1].index)
print(idx.shape)
n = 4
OnlyEvent_arr = dfNorm.iloc[np.unique(np.concatenate(
    [np.arange(max(i-n, 0), min(i+n+1, len(dfNorm))) for i in idx]))]
pd.options.display.max_rows = 100
OnlyEvent_arr.head(100)
# After checking the event label is in the middle of each OnlyEvent period, remove the label and make an array of chunks
OnlyEvent_arr = OnlyEvent_arr.iloc[:,:-1]
OnlyEvent_arr.head()
ChunkCounter = 0
result_arrList = []
chunkSize = 9
for df_chunk in chunk(OnlyEvent_arr, chunkSize):
    if df_chunk.shape[0] < chunkSize:
        pass
    else:
        result_arrList.append(df_chunk)
        ChunkCounter = ChunkCounter + 1
        #print(df_chunk)
ChunkCounter
443
# OnlyEvent 443 periods - chunks of 9 time steps and 11 features each:
OnlyEvent_arr = np.stack(result_arrList, axis=0)
OnlyEvent_arr.shape
(443, 9, 11)
Now the anomaly rate is 443 / 56,625 ≈ 0.78% ... roughly 9 times higher than the original 0.087%, which should make the problem much easier for the model...
# Concatenate the 2 arrays before split into train and test
AllFeat_arr = np.append(NoEvent_arr, OnlyEvent_arr, axis = 0)
AllFeat_arr.shape
(56625, 9, 11)
# Create the label ... y
y0 = np.zeros(NoEvent_arr.shape[0])
y1 = np.ones(OnlyEvent_arr.shape[0])
y = np.append(y0, y1, axis = 0)
y.shape
(56625,)
56k samples, each has 9 time steps and 11 features (56k tables of 9 rows X 11 cols)
# Split into train and test
X_train, X_test, y_train, y_test = train_test_split(AllFeat_arr, y, test_size=0.2, random_state=7)
print ('X_train: ', X_train.shape)
print ('X_test: ', X_test.shape)
print ('y_train: ', y_train.shape)
print ('y_test: ', y_test.shape)
X_train: (45300, 9, 11)
X_test: (11325, 9, 11)
y_train: (45300,)
y_test: (11325,)
# Reshape the arrays to add a trailing channel dimension, as expected by Conv2D
data_train_wide = X_train.reshape((X_train.shape[0], X_train.shape[1], X_train.shape[2], 1))
data_test_wide = X_test.reshape((X_test.shape[0], X_test.shape[1], X_test.shape[2], 1))
print(data_train_wide.shape)
print(data_test_wide.shape)
(45300, 9, 11, 1)
(11325, 9, 11, 1)
Conv2D NN
Baseline accuracy by majority voting (always predicting NoEvent) = 100% - 0.78% ≈ 99.2%
import keras.backend as K
def get_f1(y_true, y_pred):  # taken from old keras source code
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    recall = true_positives / (possible_positives + K.epsilon())
    f1_val = 2 * (precision * recall) / (precision + recall + K.epsilon())
    return f1_val
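A quick sanity check of the metric (a toy sketch, not part of the original pipeline): get_f1 rounds predictions at 0.5, so it should roughly agree with sklearn's f1_score on the same hard predictions.
# Toy batch only - values are made up for illustration
y_true_toy = K.constant([0., 1., 1., 0., 1.])
y_pred_toy = K.constant([0.1, 0.9, 0.4, 0.2, 0.8])
print(K.eval(get_f1(y_true_toy, y_pred_toy)))                        # ~0.8 (up to K.epsilon())
print(f1_score([0, 1, 1, 0, 1], [0, 1, 0, 0, 1], average='binary'))  # 0.8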
# NN model
n_filters = 64
fsize = 5 # An integer gives a square (fsize, fsize) kernel; pass a tuple such as (1, fsize) for a rectangular kernel
window_size = 9 # Number of time steps in one period
n_features = 11 # Number of cols in one sample (one table)
MyModel = models.Sequential()
MyModel.add(layers.Conv2D(n_filters, fsize, activation='relu', input_shape=(window_size, n_features, 1)))
MyModel.add(layers.Flatten())
MyModel.add(layers.Dense(256, activation='relu'))
#MyModel.add(layers.Dropout(0.2))
MyModel.add(layers.Dense(1, activation='sigmoid'))
MyModel.compile(optimizer=optimizers.Adam(lr=1e-4),
loss='binary_crossentropy',
metrics=[get_f1])
#metrics=['binary_accuracy'])
print(MyModel.summary())
Model: "sequential"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
conv2d (Conv2D) (None, 5, 7, 64) 1664
_________________________________________________________________
flatten (Flatten) (None, 2240) 0
_________________________________________________________________
dense (Dense) (None, 256) 573696
_________________________________________________________________
dense_1 (Dense) (None, 1) 257
=================================================================
Total params: 575,617
Trainable params: 575,617
Non-trainable params: 0
_________________________________________________________________
None
# Train / fit
history = MyModel.fit(data_train_wide, y_train,
validation_split=0.2,
epochs = 20,
batch_size = 16)
#Learning curves ... F1
acc = history.history['get_f1']
val_acc = history.history['val_get_f1']
loss = history.history['loss']
val_loss = history.history['val_loss']
epochs = range(1, len(acc) + 1)
plt.plot(epochs, acc, 'bo', label='Training F1')
plt.plot(epochs, val_acc, 'b', label='Validation F1')
plt.title('Training and validation F1')
plt.legend()
plt.figure()
plt.plot(epochs, loss, 'bo', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.legend()
plt.show()
# Final Predict
# NOTE final_predictions is a list of probabilities
final_predictions = MyModel.predict(data_test_wide)
final_predictions.shape
(11325, 1)
# Modify the raw final_predictions - prediction probs into 0 and 1
Preds = final_predictions.copy()
#print(len(Preds))
#print(Preds)
Preds[ np.where( Preds >= 0.5 ) ] = 1
Preds[ np.where( Preds < 0.5 ) ] = 0
Preds.shape
(11325, 1)
# Confusion matrix
from sklearn import metrics
conf_mx = metrics.confusion_matrix(y_test, Preds)
TN = conf_mx[0,0]
FP = conf_mx[0,1]
FN = conf_mx[1,0]
TP = conf_mx[1,1]
print ('TN: ', TN)
print ('FP: ', FP)
print ('FN: ', FN)
print ('TP: ', TP)
recall = TP/(TP+FN)
precision = TP/(TP+FP)
print (recall, precision)
TN: 11242
FP: 0
FN: 2
TP: 81
0.9759036144578314 1.0
def plot_confusion_matrix(cm, target_names, title='Confusion matrix', cmap=None,
                          normalize=False):
    import itertools
    accuracy = np.trace(cm) / float(np.sum(cm))
    misclass = 1 - accuracy
    if cmap is None:
        cmap = plt.get_cmap('Blues')
    plt.figure(figsize=(8, 6))
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    if target_names is not None:
        tick_marks = np.arange(len(target_names))
        plt.xticks(tick_marks, target_names, rotation=45)
        plt.yticks(tick_marks, target_names)
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    thresh = cm.max() / 1.5 if normalize else cm.max() / 2
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        if normalize:
            plt.text(j, i, "{:0.4f}".format(cm[i, j]),
                     horizontalalignment="center",
                     color="white" if cm[i, j] > thresh else "black")
        else:
            plt.text(j, i, "{:,}".format(cm[i, j]),
                     horizontalalignment="center",
                     color="white" if cm[i, j] > thresh else "black")
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label\naccuracy={:0.4f}; misclass={:0.4f}'.format(accuracy, misclass))
    plt.show()
plot_confusion_matrix(conf_mx,
                      normalize=False,
                      target_names=['NoEvent', 'Event'],
                      title="Confusion Matrix on test")
from sklearn.metrics import precision_recall_curve, average_precision_score, auc
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
# calculate precision-recall curve
precision, recall, thresholds = precision_recall_curve(y_test, Preds)
# calculate F1 score
f1 = metrics.f1_score(y_test, Preds)
print('f1=%.3f' % (f1))
# plot the no-skill baseline (for a precision-recall curve this is the positive-class prevalence)
no_skill = y_test.sum() / len(y_test)
plt.plot([0, 1], [no_skill, no_skill], linestyle='--')
# plot the precision-recall curve for the model
plt.plot(recall, precision, marker='.')
# show the plot
plt.show()
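Since Preds is already thresholded to 0/1, the curve above has only a couple of points; a sketch of the same plot using the raw probabilities instead (assuming final_predictions and y_test from above):
# Sketch: precision-recall curve computed from the predicted probabilities
prec, rec, thr = precision_recall_curve(y_test, final_predictions.ravel())
print('average precision = %.3f' % average_precision_score(y_test, final_predictions.ravel()))
plt.plot(rec, prec, marker='.')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.show()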
Results
Adam(lr=1e-4), batch size = 16, monitored with get_f1, square kernel of size (fsize, fsize), 20 epochs
- Conv2D 64 filters, Flatten, Dense 256, Sigmoid, fsize=2 ... F1 = 0.975
- Conv2D 64 filters, Flatten, Dense 256, Sigmoid, fsize=3 ... F1 = 0.982
- Conv2D 64 filters, Flatten, Dense 256, Sigmoid, fsize=4 ... F1 = 0.988
- Conv2D 64 filters, Flatten, Dense 256, Sigmoid, fsize=5 ... F1 = 0.988
- Conv2D 64 filters, Flatten, Dense 256, Sigmoid, fsize=6 ... F1 = 0.988
- Conv2D 64 filters, Flatten, Dense 256, Dropout 0.2, Sigmoid, fsize=5 ... F1 = 0.976
- Conv2D 64 filters, Flatten, Dense 256, Dropout 0.2, Sigmoid, rectangular kernel size = (1, 5) ... F1 = 0.982
- Conv2D 64 filters, Flatten, Dense 256, Sigmoid, 20 epochs, optimized on get_f1 ... F1 = 0.982
- Conv2D 64 filters, Flatten, Dense 256, Sigmoid, 20 epochs, optimized on binary_acc ... F1 = 0.956
- Conv2D 64 filters, Flatten, Sigmoid, 50 epochs ... F1 = 0.91