Building a Future-Prediction LSTM Model from CSV Data
2022. 12. 8. 13:41
🔷 A quick look at the data before preprocessing~
🔸 1. Merging the CSV files from several folders
import numpy as np               # linear algebra
import pandas as pd              # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import math as math
%matplotlib inline
import pickle
import os

# Load the data
forder_list = os.listdir("./DATA")
forder_list

# One list per CSV file position; each folder is expected to contain six CSV files
csv1 = []
csv2 = []
csv3 = []
csv4 = []
csv5 = []
csv6 = []

for forder in forder_list:
    paths = "./DATA/" + forder + "/"
    # print(paths)
    file_path = sorted(os.listdir(paths))
    # print(file_path)
    csv1.append(paths + file_path[0])
    csv2.append(paths + file_path[1])
    csv3.append(paths + file_path[2])
    csv4.append(paths + file_path[3])
    csv5.append(paths + file_path[4])
    csv6.append(paths + file_path[5])
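The loop above leans on an assumption worth spelling out: every folder under ./DATA holds exactly six CSV files, and sorting the file names lines the same kind of file up at the same position in each folder. As a design note, here is a minimal sketch of the same collection step written with glob (the helper name collect_kth_csv is my own, not from the original post):

import glob
import os

# Hypothetical helper: gather the k-th CSV (by sorted file name) from every folder under root
def collect_kth_csv(root="./DATA", k=0):
    paths = []
    for folder in sorted(os.listdir(root)):
        files = sorted(glob.glob(os.path.join(root, folder, "*.csv")))
        paths.append(files[k])   # assumes each folder has at least k+1 CSV files
    return paths

# csv1_alt = collect_kth_csv(k=0)   # would mirror csv1 from the loop above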
🔸 2. Concatenating CSVs of the same kind into one DataFrame (axis=0)
allData = []                  # empty list to hold the contents of the CSV files we read in
for file in csv1:
    df = pd.read_csv(file)    # read each CSV file in the for loop
    allData.append(df)        # append what was read to the list
dataCombine = pd.concat(allData, axis=0, ignore_index=True)
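The post only shows the concatenation for csv1; the same three lines would be repeated for csv2 through csv6. A minimal sketch of a small helper that wraps the pattern (the name combine_csv_list is my own, not from the original post):

def combine_csv_list(csv_paths):
    # Read every CSV in the list and stack the rows into one DataFrame
    frames = [pd.read_csv(path) for path in csv_paths]
    return pd.concat(frames, axis=0, ignore_index=True)

# dataCombine = combine_csv_list(csv1)
# dataCombine_b = combine_csv_list(csv2)   # and likewise for csv3 .. csv6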
🔷 Visualization
fig = plt.figure()       # declare a figure object: the canvas the graph is drawn on
ax = fig.add_subplot()   # create the frame and draw the axes
# add_subplot() attaches a child subplot to the figure() object; the axes have to be drawn this way
# the arguments 1, 1, 1 mean: nrows (rows), ncols (columns), index (position where the graph is drawn)

for i in range(2, 15):
    # plot one specific column
    plt.title(dataCombine.columns[i])
    plt.plot(dataCombine["insert_date_time"], dataCombine.iloc[:, i], color='r')
    # show the finished plot
    plt.show()
There seems to be too much data, so let's sum it by timestamp and put the data back together.
🔷 Data preprocessing
🔸 1. Group rows with the same insert_date_time and sum them -> shrinks the data
dataCombine2 = dataCombine.groupby(["insert_date_time"], as_index=True).sum()
dataCombine2
🔸 2. Getting ready to scale the data
dataCombine2_temp = dataCombine2.reset_index(drop=True)
🔷 Visualization
fig = plt.figure()       # declare a figure object: the canvas the graph is drawn on
ax = fig.add_subplot()   # create the frame and draw the axes
# add_subplot() attaches a child subplot to the figure() object; the axes have to be drawn this way
# the arguments 1, 1, 1 mean: nrows (rows), ncols (columns), index (position where the graph is drawn)

for i in range(1, 14):
    # plot one specific column
    plt.title(dataCombine2_temp.columns[i])
    plt.plot(dataCombine2.index, dataCombine2_temp.iloc[:, i], color='r')
    # show the finished plot
    plt.show()
🔷 Building the model
🔸 1. Drop the zero-only columns, keep just the columns with a periodic pattern, and train the LSTM model on those
# The dropped column names are placeholders: one column that is not needed,
# two that contain nothing but zeros, and one that is irregular
dataCombine3_temp = dataCombine2_temp.drop(["not_needed", "zeros_only_1", "zeros_only_2", "irregular"], axis=1)
dataCombine3_temp
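If you would rather find the zero-only columns programmatically than list them by hand, a minimal sketch (my own addition, assuming the dataCombine2_temp DataFrame from the step above):

# Columns in which every value is 0
zero_cols = dataCombine2_temp.columns[(dataCombine2_temp == 0).all()]
print(zero_cols)

# They could then be dropped directly:
# dataCombine3_temp = dataCombine2_temp.drop(columns=zero_cols)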
🔸 2. Normalization
!pip install -U scikit-learn
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scale_cols = dataCombine3_temp.columns
dataCombine3_temp_scaled = scaler.fit_transform(dataCombine3_temp)
dataCombine3_temp_scaled = pd.DataFrame(dataCombine3_temp_scaled)
dataCombine3_temp_scaled.columns = scale_cols
dataCombine3_temp_scaled
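One caveat with the cell above: fit_transform fits the scaler on the whole table, including the rows that later become the test set, so the test period leaks slightly into the scaling. A minimal sketch of the stricter variant that fits only on the training rows (my own addition; the names scaler_train_only, train_scaled, and test_scaled are mine, and TEST_SIZE matches the value defined a little further down):

TEST_SIZE = 200

scaler_train_only = MinMaxScaler()
# Fit on the training portion only, then apply the same transform to the held-out portion
train_scaled = pd.DataFrame(scaler_train_only.fit_transform(dataCombine3_temp[:-TEST_SIZE]),
                            columns=scale_cols)
test_scaled = pd.DataFrame(scaler_train_only.transform(dataCombine3_temp[-TEST_SIZE:]),
                           columns=scale_cols)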
🔸 3. Splitting the time-series dataset
- A time-series dataset is usually split using a window_size.
- window_size is the parameter that decides how many past time steps (in the original explanation, how many days of past stock prices) are used to predict the next value.
- If we assume tomorrow's value is predicted from the past 20 days, then window_size=20: rows i to i+19 form one input window and row i+20 is its label.
TEST_SIZE = 200
WINDOW_SIZE = 20

train = dataCombine3_temp_scaled[:-TEST_SIZE]
test = dataCombine3_temp_scaled[-TEST_SIZE:]

def make_dataset(data, label, window_size=20):
    feature_list = []
    label_list = []
    for i in range(len(data) - window_size):
        # each sample is window_size consecutive rows of features ...
        feature_list.append(np.array(data.iloc[i:i+window_size]))
        # ... and its label is the value right after that window
        label_list.append(np.array(label.iloc[i+window_size]))
    return np.array(feature_list), np.array(label_list)
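As a quick sanity check on what make_dataset returns: with N input rows, window_size w, and f feature columns, the feature array has shape (N - w, w, f) and the label array has shape (N - w, 1). A tiny self-contained example with made-up numbers (my own, not from the post):

demo = pd.DataFrame(np.arange(100.0).reshape(25, 4),
                    columns=["a", "b", "c", "d"])        # 25 rows, 4 feature columns
X_demo, y_demo = make_dataset(demo, demo[["d"]], window_size=20)
print(X_demo.shape, y_demo.shape)                        # expected: (5, 20, 4) (5, 1)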
🔸 4. Splitting into training and test data
from sklearn.model_selection import train_test_split
feature_cols = ['routegroupque', 'cnt_fastque', 'cnt_slowque', 'cnt_dbque',
                'cnt_waitackmap', 'cnt_success', 'cnt_invalid', 'cnt_timeout',
                'cnt_resent', 'cnt_onemin']
label_cols = ['cnt_timeout']
train_feature = train[feature_cols]
train_label = train[label_cols]
train_feature, train_label = make_dataset(train_feature, train_label, 20)
x_train, x_valid, y_train, y_valid = train_test_split(train_feature, train_label, test_size=0.2)
x_train.shape, x_valid.shape
test_feature = test[feature_cols]
test_label = test[label_cols]
test_feature.shape, test_label.shape
test_feature, test_label = make_dataset(test_feature, test_label, 20)
test_feature.shape, test_label.shape
🔸 5. The LSTM model
from keras.models import Sequential
from keras.layers import Dense
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.layers import LSTM
model = Sequential()
model.add(LSTM(16,
               input_shape=(train_feature.shape[1], train_feature.shape[2]),
               activation='relu',
               return_sequences=False))
model.add(Dense(1))

import os

model.compile(loss='mean_squared_error', optimizer='adam')
early_stop = EarlyStopping(monitor='val_loss', patience=5)

model_path = 'model'
os.makedirs(model_path, exist_ok=True)   # make sure the checkpoint folder exists before saving
filename = os.path.join(model_path, 'tmp_checkpoint.h5')
checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='auto')
history = model.fit(x_train, y_train,
                    epochs=200,
                    batch_size=16,
                    validation_data=(x_valid, y_valid),
                    callbacks=[early_stop, checkpoint])
model.load_weights(filename)
pred = model.predict(test_feature)
pred.shape
plt.figure(figsize=(12, 9))
plt.plot(test_label, label = 'actual')
plt.plot(pred, label = 'prediction')
plt.legend()
plt.show()
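The plot above compares prediction and actual values on the MinMax-scaled 0-1 range. To read the error in the original units of cnt_timeout, the predictions can be mapped back through the scaler; a minimal sketch (my own addition, assuming the scaler and column order from the normalization step above):

from sklearn.metrics import mean_squared_error

# Error on the scaled values
rmse_scaled = np.sqrt(mean_squared_error(test_label, pred))
print("RMSE (scaled):", rmse_scaled)

# MinMaxScaler scales each column independently, so a single column can be
# inverted with its own data_min_ / data_max_ values
col_idx = list(dataCombine3_temp_scaled.columns).index('cnt_timeout')
col_min = scaler.data_min_[col_idx]
col_max = scaler.data_max_[col_idx]
pred_original = pred * (col_max - col_min) + col_min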