from glob import glob
import numpy as np
import pandas as pd
from tqdm import tqdm
def reshape_data(t_d, cla):
    """Cut each CSV in t_d into 188-row segments and store the class label cla in the last row of each segment."""
    one_data = []
for t in tqdm(t_d):
t_data = pd.read_csv(t)
one = []
for i in range(0, len(t_data), 188):
j = 188 + i
            temp = t_data.values[i:j, :3]   # one segment: 188 rows, first 3 columns
            if temp.shape[0] != 188:        # drop the trailing, incomplete segment
                continue
            temp[187, :] = [cla] * 3        # last row carries the class label in every channel
one.append(temp)
one_data.append(one)
return one_data
def to_csv_data():
    a_data = []
    # One folder per class: folders "1".."5" map to labels 0..4.
    for cla, folder in enumerate(["1", "2", "3", "4", "5"]):
        files = glob("{}/*.csv".format(folder))
        a = reshape_data(files, cla=cla)
        np.random.shuffle(a)   # shuffle at the file level
        a = a[:1000]           # keep at most 1000 files per class
        a_data += a
    # Stack everything into a single array of shape (n_segments, 188, 3).
    a_data = np.vstack(a_data)
    np.savez_compressed("a_data", a_data)
def split_data():
    a = np.load("a_data.npz")["arr_0"]
    # Shuffle all segments, then hold out the last 20% as the test set.
    a = a[np.random.permutation(a.shape[0])]
    n_train = int(a.shape[0] * 0.8)
    np.savez_compressed("test", a[n_train:])
    # Write the training portion out in 16 roughly equal chunks.
    chunk = n_train // 16
    for i in range(0, n_train, chunk):
        j = i + chunk
        if j < n_train:
            np.savez_compressed("train_{}".format(i), a[i:j])
        else:
            np.savez_compressed("train_{}".format(i), a[i:n_train])
if __name__ == '__main__':
    # data preprocessing
    to_csv_data()
    # train/test split
    split_data()
This code handles data preprocessing and data splitting.
First, the required libraries are imported: glob for collecting file paths, numpy for array handling, pandas for tabular data, and tqdm for progress bars.
The function reshape_data(t_d, cla)
reshapes the raw data. It iterates over the file-path list t_d and reads each file with pd.read_csv. Each file is then walked through 188 rows at a time; the first three columns of every 188-row block are copied into a temporary array temp, and any trailing block with fewer than 188 rows is skipped. The last row of temp is overwritten with the label cla, and temp is appended to the list one. Once a file has been processed, one is appended to the result list one_data, which is returned.
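As a quick illustration (toy data, not the real CSVs), a single segment produced by this logic looks like the following sketch:
import numpy as np

# Toy stand-in for t_data.values[:, :3]: 188 rows, 3 channels.
segment = np.arange(188 * 3, dtype=float).reshape(188, 3)
cla = 2
segment[187, :] = [cla] * 3   # last row overwritten with the class label
print(segment.shape)          # (188, 3)
print(segment[187])           # [2. 2. 2.]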
The function to_csv_data()
packs the data into a single array. It first uses glob to collect the CSV paths for each class (folders 1-5, mapped to labels 0-4), then calls reshape_data() on each class and stores the result in the list a. The list a is shuffled and truncated to at most 1000 entries (one per source file) per class, and the surviving entries are appended to a_data. Finally, all segments are stacked into one numpy array of shape (n_segments, 188, 3) and saved as a compressed .npz file with np.savez_compressed.
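A quick sanity check of the saved archive (the exact shape depends on how many segments each class contributes):
import numpy as np

a = np.load("a_data.npz")["arr_0"]
print(a.shape)            # (n_segments, 188, 3)
signals = a[:, :187, :]   # the 187 signal samples per segment
labels = a[:, 187, 0]     # class label, duplicated across the 3 channels
print(np.unique(labels))  # expected: [0. 1. 2. 3. 4.]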
The function split_data()
splits the data into a training set and a test set. It loads the saved .npz file into a, shuffles the rows with np.random.permutation, and saves the last 20% as the test set (test.npz) using np.savez_compressed. The first 80% is then written out in 16 roughly equal chunks, each saved as a compressed train_{i}.npz file; the final chunk simply takes whatever rows remain.
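For a hypothetical archive of 10 000 segments, the boundaries produced by this loop would be:
# Hypothetical N; the real value depends on how much data was packed above.
N = 10000
n_train = int(N * 0.8)        # 8000 segments for training, 2000 go to test.npz
chunk = n_train // 16         # 500 segments per train_*.npz file
bounds = [(i, min(i + chunk, n_train)) for i in range(0, n_train, chunk)]
print(bounds[0], bounds[-1])  # (0, 500) ... (7500, 8000)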
import warnings
import numpy as np
from matplotlib import pyplot as plt
from pyts.image import GramianAngularField, MarkovTransitionField
from sklearn.metrics.pairwise import pairwise_distances
import pandas as pd
from torchvision import datasets, transforms
import h5py
from PIL import Image
from glob import glob
from tqdm import tqdm
warnings.filterwarnings("ignore")
def get_mtf(x, size=10):
    # Markov Transition Field: map each value to a quantile bin, estimate the
    # bin-to-bin transition probabilities, then spread them over all time-step pairs.
    q = np.vectorize(value_to_quantile)(x)
    r = np.zeros((q.shape[0], q.shape[0]))
    y = np.zeros((size, size))
    for i in range(x.shape[0] - 1):
        y[q[i], q[i + 1]] += 1              # count transitions between consecutive bins
    y = y / y.sum(axis=1, keepdims=True)    # row-normalise into probabilities
    y[np.isnan(y)] = 0                      # bins that are never visited get all-zero rows
    for i in range(r.shape[0]):
        for j in range(r.shape[1]):
            r[i, j] = y[q[i], q[j]]         # MTF entry: transition probability between the bins of points i and j
    return r / 5. - 1                       # rescale to match the other image channels
def recurrence_plot(s, eps=None, steps=None):
    # Thresholded recurrence plot: pairwise distances scaled by eps and clipped at `steps`.
    if eps is None:
        eps = 0.1
    if steps is None:
        steps = 10
    d = pairwise_distances(s[:, None])
    d = d / eps
    d[d > steps] = steps
    return d / 5. - 1   # rescale to roughly [-1, 1]
def get_quantiles(min_value=0, max_val=1, k=10):
c = (max_val - min_value)/k
b = min_value + c
d = []
for i in range(1, k):
d.append(b)
b += c
d.append(max_val)
return d
quantiles = get_quantiles()
def value_to_quantile(x):
    # Map a value to its quantile bin index; values above the last edge fall into the last bin.
    for i, k in enumerate(quantiles):
        if x <= k:
            return i
    return len(quantiles) - 1
def to_pickle_train():
    # Split every train_*.npz chunk into one pickle per segment, each of shape (1, 188, 3).
    # NOTE: the data/ directory must already exist.
    path = glob("train_*.npz")
    for path_id, path_one in enumerate(tqdm(path)):
        df_hb_train_data = np.load(path_one)["arr_0"]
        for df_id in range(0, df_hb_train_data.shape[0]):
            df_id_end = min(df_id + 1, df_hb_train_data.shape[0])
            df_hb_train = df_hb_train_data[df_id:df_id_end]
            pd.to_pickle(df_hb_train, 'data/train_{}_{}.pkl'.format(path_id, df_id))
def to_pickle_test():
    # Same for the held-out test set: one pickle per segment.
    df_hb_test_data = np.load("test.npz")["arr_0"]
    for df_id in range(0, df_hb_test_data.shape[0]):
        df_id_end = min(df_id + 1, df_hb_test_data.shape[0])
        df_hb_test = df_hb_test_data[df_id:df_id_end]
        pd.to_pickle(df_hb_test, 'data/test_{}.pkl'.format(df_id))
def get_data(path):
    # Load one single-segment pickle and turn each of the 3 signal channels into
    # a GAF, an RP and an MTF image, giving a (9, 224, 224) array plus the label.
    df_hb_test = pd.read_pickle(path)
    out_list = []
    for ch in range(3):
        x_test = df_hb_test[:, :187, ch]             # the 187 signal samples
        y_test = df_hb_test[:, 187, ch].astype(int)  # the label stored in the last row
        gasf = GramianAngularField(image_size=150, method='difference')
        x_gasf_test_3 = gasf.transform(x_test)
        # 3 representations per channel: GAF, RP and MTF, each resized to 224x224
        transform = transforms.Compose([transforms.Resize((224, 224))])
        for i, (x1, x2, y) in enumerate(zip(x_gasf_test_3, x_test, y_test)):
            image = Image.fromarray(x1.astype(float))
            t = np.array(transform(image))
            r = recurrence_plot(x2, steps=10)
            mtf = get_mtf(x2)
            image = Image.fromarray(r.astype(float))
            t2 = np.array(transform(image))
            image = Image.fromarray(mtf.astype(float))
            t3 = np.array(transform(image))
            out = np.concatenate([t.reshape([1, 224, 224]),
                                  t2.reshape([1, 224, 224]),
                                  t3.reshape([1, 224, 224])], 0)
            out_list.append(out)
    # `y` comes from the last loop iteration, so this relies on every pickle
    # holding exactly one segment (as produced by to_pickle_train/to_pickle_test).
    return np.concatenate(out_list), y
if __name__ == '__main__':
    to_pickle_train()
    to_pickle_test()
This code contains functions for time-series processing and feature extraction.
It first imports the necessary libraries and modules, including warnings, numpy, matplotlib, pyts, sklearn, pandas, torchvision, h5py, PIL and glob.
It then defines the feature-extraction helpers: get_mtf computes a Markov Transition Field (MTF), recurrence_plot computes a Recurrence Plot (RP), get_quantiles returns the quantile bin edges, and value_to_quantile maps a value to its quantile bin.
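For comparison, pyts (already imported above for the GAF) ships ready-made MTF and RP transformers. A minimal sketch, assuming a normalised 1-D series of length 187; note that these outputs are scaled differently from the hand-rolled helpers above:
import numpy as np
from pyts.image import MarkovTransitionField, RecurrencePlot

x = np.random.rand(1, 187)            # hypothetical series in [0, 1]
mtf_img = MarkovTransitionField(n_bins=10).transform(x)[0]
rp_img = RecurrencePlot(threshold=None).transform(x)[0]
print(mtf_img.shape, rp_img.shape)    # both (187, 187)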
Next, two functions, to_pickle_train and to_pickle_test, convert the raw .npz data into per-segment pickle files on disk.
Finally, get_data reads one pickle file and extracts its features: Gramian Angular Field (GAF), RP and MTF images are computed for each channel and stacked into a single numpy array.
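A minimal usage sketch (the file name is hypothetical; any single-segment pickle written by to_pickle_test works):
x, y = get_data("data/test_0.pkl")   # hypothetical path
print(x.shape)   # (9, 224, 224): GAF, RP and MTF for each of the 3 signal channels
print(y)         # integer class label (0-4)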
Under the if __name__ == '__main__' guard, to_pickle_train and to_pickle_test are called to convert the raw data into pickle files on disk.
from glob import glob
import torch
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader, Dataset
import h5py
import itertools
import numpy as np
import torch.nn as nn
from torchvision.models import alexnet, vgg16, resnet152, resnet18, vgg19
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, recall_score, f1_score
from sklearn.metrics import confusion_matrix
import warnings
from tqdm import tqdm
from bb import get_data
warnings.filterwarnings("ignore")
# Function for moving tensor or model to GPU
def cuda(xs):
if torch.cuda.is_available():
if not isinstance(xs, (list, tuple)):
return xs.cuda()
else:
return [x.cuda() for x in xs]
else:
return xs
# Custom Dataset that loads one pickled segment and builds its multi-channel image stack
class Dataset_Hdf5_3C(Dataset):
def __init__(self, path, data_type, ch=[0, 1, 2, 3, 4, 5, 6, 7, 8]):
""" Intialize the dataset
"""
self.path = path
self.images = []
self.ch = ch
self.len = len(path)
# You must override __getitem__ and __len__
def __getitem__(self, index):
""" Get a sample from the dataset
"""
        # get_data returns a (9, 224, 224) float array (GAF, RP and MTF for each signal channel) plus the label
        input_data, label = get_data(self.path[index])
one = torch.tensor(input_data.astype('float32'))
out = one[self.ch]
return out, torch.tensor([int(label)], dtype=torch.long).reshape(-1)
def __len__(self):
"""
Total number of samples in the dataset
"""
return self.len
def train(net, train_loader, criterion, optimizer, test_loader, num_epochs=200):
net.train()
recall_max = 0
f1_max = 0
scheduler = optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.95)
bar= tqdm(range(num_epochs))
for epoch in bar: # loop over the dataset multiple times
net.train()
total = 0
correct = 0
running_loss = 0.0
for i, data in enumerate(train_loader, 0):
# get the inputs; data is a list of [inputs, labels]
inputs, labels = cuda(data)
labels = labels.reshape(-1)
# zero the parameter gradients
optimizer.zero_grad()
# forward + backward + optimize
# outputs = net(inputs.expand(-1, 3, -1, -1)) # converting to RGB, when we need single channel
outputs = net(inputs) # already in RGB, when we need 3 channels
loss = criterion(outputs, labels)
_, predicted = torch.max(outputs, 1)
total += labels.size(0)
correct += (predicted == labels).sum().item()
bar.set_description('Epoch: {} Loss: {:.4f} Train Acc: {:.4f}'.format(epoch + 1, loss.item(), correct / total))
loss.backward()
optimizer.step()
# print statistics
running_loss += loss.item()
scheduler.step()
print('End of epoch {}, Loss {}'.format(epoch + 1, running_loss / len(train_loader)))
train_acc = correct / total
print('Train accuracy: {}'.format(train_acc))
test_acc, all_true, all_pred = test(net, test_loader)
print('Test accuracy: {}'.format(test_acc))
print(classification_report(all_true, all_pred))
recall = recall_score(all_true, all_pred, average='macro')
f1 = f1_score(all_true, all_pred, average='macro')
# Saving best checkpoint based on performance on test data
if recall > recall_max:
recall_max = recall
save_checkpoint(epoch + 1, net, optimizer, train_acc, test_acc, recall, f1, 'res_net',
'mtf_gaf_rp_test_recall')
if f1 > f1_max:
f1_max = f1
save_checkpoint(epoch + 1, net, optimizer, train_acc, test_acc, recall, f1, 'res_net', 'mtf_gaf_rp_test_f1')
print('Finished Training')
def test(net, test_loader):
    net.eval()
    net = cuda(net)   # move to GPU only when one is available
correct = 0
total = 0
all_true = []
all_pred = []
with torch.no_grad():
for i, data in enumerate(test_loader, 0):
images, labels = cuda(data)
labels = labels.reshape(-1)
# images, labels = data
all_true.extend(labels.cpu().tolist())
# outputs = net(images.expand(-1, 3, -1, -1))
outputs = net(images)
_, predicted = torch.max(outputs, 1)
all_pred.extend(predicted.cpu().tolist())
total += labels.size(0)
correct += (predicted == labels).sum().item()
acc = correct / total
# print('Accuracy of the network on the images: %d %%' % (100 * acc))
return acc, all_true, all_pred
def save_checkpoint(epoch, net, optimizer, train_acc, test_acc, recall, f1, net_name, param):
    checkpoint = {
        'epoch': epoch,
        'net': net.state_dict(),
        'optimizer': optimizer.state_dict(),
        'train_acc': train_acc,
        'test_acc': test_acc,
        'recall': recall,
        'f1': f1
    }
torch.save(checkpoint, '{}_{}_best.chk'.format(net_name, param))
# From sklearn
def plot_confusion_matrix(cm, classes,
normalize=False,
title='Confusion matrix',
cmap=plt.cm.Blues):
"""
This function prints and plots the confusion matrix.
Normalization can be applied by setting `normalize=True`.
"""
if normalize:
cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
print("Normalized confusion matrix")
else:
print('Confusion matrix, without normalization')
print(cm)
plt.imshow(cm, interpolation='nearest', cmap=cmap)
plt.title(title)
plt.colorbar()
tick_marks = np.arange(len(classes))
plt.xticks(tick_marks, classes, rotation=0)
plt.yticks(tick_marks, classes)
fmt = '.2f' if normalize else 'd'
thresh = cm.max() / 2.
for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
plt.text(j, i, format(cm[i, j], fmt),
horizontalalignment="center",
color="white" if cm[i, j] > thresh else "black")
plt.tight_layout()
plt.ylabel('True label')
plt.xlabel('Predicted label')
# Compute confusion matrix
def compute_confusion_matrix(all_true, all_pred):
class_names = ['T1', 'T2', 'T3', 'T4', 'T5']
cnf_matrix = confusion_matrix(all_true, all_pred)
np.set_printoptions(precision=2)
# Plot non-normalized confusion matrix
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names,
title='Confusion matrix, without normalization')
# Plot normalized confusion matrix
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names, normalize=True,
title='Normalized confusion matrix')
plt.show()
def val_model():
ch = [0, 1, 2, 3, 4, 5, 6, 7, 8]
test_path = glob("data/test*.pkl")
hb_test_loader_bln_6 = torch.utils.data.DataLoader(
Dataset_Hdf5_3C(test_path, 'test', ch),
batch_size=256, shuffle=False)
ckpt_recall = torch.load('res_net_mtf_gaf_rp_test_recall_best.chk')
res_net_saved_recall = resnet18(pretrained=True)
for param in res_net_saved_recall.parameters():
param.requires_grad = False
num_ftrs = res_net_saved_recall.fc.in_features
res_net_saved_recall.fc = nn.Sequential(
nn.Linear(in_features=num_ftrs, out_features=256, bias=False),
nn.ReLU(),
nn.Linear(in_features=256, out_features=128, bias=True),
nn.ReLU(),
nn.Linear(in_features=128, out_features=5, bias=True))
res_net_saved_recall.conv1 = nn.Conv2d(len(ch), 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
res_net_saved_recall.load_state_dict(ckpt_recall['net'])
ckpt_f1 = torch.load('res_net_mtf_gaf_rp_test_f1_best.chk')
res_net_saved_f1 = resnet18(pretrained=True)
for param in res_net_saved_f1.parameters():
param.requires_grad = False
num_ftrs = res_net_saved_f1.fc.in_features
res_net_saved_f1.fc = nn.Sequential(
nn.Linear(in_features=num_ftrs, out_features=256, bias=False),
nn.ReLU(),
nn.Linear(in_features=256, out_features=128, bias=True),
nn.ReLU(),
nn.Linear(in_features=128, out_features=5, bias=True))
res_net_saved_f1.conv1 = nn.Conv2d(len(ch), 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
res_net_saved_f1.load_state_dict(ckpt_f1['net'])
test_acc_r, all_true_r, all_pred_r = test(res_net_saved_recall, hb_test_loader_bln_6)
# test_acc_f, all_true_f, all_pred_f = test(res_net_saved_f1, hb_test_loader_bln_6)
compute_confusion_matrix(all_true_r, all_pred_r)
# compute_confusion_matrix(all_true_f, all_pred_f)
def train_model():
ch = [0, 1, 2, 3, 4, 5, 6, 7, 8]
test_path = glob("data/test*.pkl")
train_path = glob("data/train*.pkl")
# ch=[0,1,3,4,6,7]
hb_train_loader_bln_6 = torch.utils.data.DataLoader(
Dataset_Hdf5_3C(
train_path ,
'train', ch),
batch_size=64, shuffle=True)
hb_test_loader_bln_6 = torch.utils.data.DataLoader(
Dataset_Hdf5_3C(test_path, 'test', ch),
batch_size=256, shuffle=False)
res_net = resnet18(pretrained=True)
print(res_net)
# First we freeze all layers
for param in res_net.parameters():
param.requires_grad = False # frozen layer
# Now we make later layers trainable
for param in res_net.layer4.parameters():
param.requires_grad = True
for param in res_net.layer3.parameters():
param.requires_grad = True
    for param in res_net.avgpool.parameters():
        param.requires_grad = True   # note: AdaptiveAvgPool2d has no parameters, so this is a no-op
# Parameters of newly constructed modules have requires_grad=True by default
num_ftrs = res_net.fc.in_features
# Replace 1 FC with 3 new FC
res_net.fc = nn.Sequential(
nn.Linear(in_features=num_ftrs, out_features=256, bias=False),
nn.ReLU(),
nn.Linear(in_features=256, out_features=128, bias=True),
nn.ReLU(),
nn.Linear(in_features=128, out_features=5, bias=True))
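    # Swap the stock 3-channel RGB stem for a conv layer that accepts len(ch) = 9 input
    # channels (GAF, RP and MTF images for each of the 3 signal channels).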
res_net.conv1 = nn.Conv2d(len(ch), 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
# Move ResNet to GPU
res_net.cuda()
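    # Up-weight classes 1 and 3 in the loss (presumably the under-represented classes).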
class_weights = cuda(torch.tensor([1.0, 2.0, 1.0, 2.0, 1.0]))
criterion = nn.CrossEntropyLoss(weight=class_weights)
optimizer_res_net = torch.optim.Adam([
{"params": res_net.layer3.parameters(), "lr": 0.0001},
{"params": res_net.layer4.parameters(), "lr": 0.0001},
{"params": res_net.avgpool.parameters(), "lr": 0.0001},
{"params": res_net.fc[0].parameters(), "lr": 0.0001},
{"params": res_net.fc[2].parameters(), "lr": 0.001},
{"params": res_net.fc[4].parameters(), "lr": 0.002},
],
lr=0.0001, betas=(0.5, 0.999))
train(res_net, hb_train_loader_bln_6, criterion, optimizer_res_net, hb_test_loader_bln_6, 10)
if __name__ == '__main__':
train_model()
val_model()