๐Ÿ˜Ž ๊ณต๋ถ€ํ•˜๋Š” ์ง•์ง•์•ŒํŒŒ์นด๋Š” ์ฒ˜์Œ์ด์ง€?

csv ๋ฐ์ดํ„ฐ๋กœ ๋ฏธ๋ž˜ ์˜ˆ์ธก LSTM ๋ชจ๋ธ ๋งŒ๋“ค๊ธฐ ๋ณธ๋ฌธ

๐Ÿ‘ฉ‍๐Ÿ’ป ์ธ๊ณต์ง€๋Šฅ (ML & DL)/Serial Data

csv ๋ฐ์ดํ„ฐ๋กœ ๋ฏธ๋ž˜ ์˜ˆ์ธก LSTM ๋ชจ๋ธ ๋งŒ๋“ค๊ธฐ

์ง•์ง•์•ŒํŒŒ์นด 2022. 12. 8. 13:41
728x90
๋ฐ˜์‘ํ˜•

๐Ÿฆž ์ „์ฒ˜๋ฆฌ ํ•˜๊ธฐ ์ „์— ๋ฐ์ดํ„ฐ ๊ตฌ๊ฒฝ~

๐Ÿธ1. ์—ฌ๋Ÿฌ ํด๋” ์•ˆ์— CSV ํŒŒ์ผ ํ•ฉ์น˜๊ธฐ

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import math as math
%matplotlib inline

import pickle
import os
# Load the data
forder_list = os.listdir("./DATA")
forder_list

# Collect the six CSV paths of every folder, grouped by position:
# csvN ends up holding the N-th (alphabetically sorted) file of each folder.
csv_lists = [[] for _ in range(6)]

for folder_name in forder_list:
    folder_path = "./DATA/" + folder_name + "/"
    sorted_files = sorted(os.listdir(folder_path))
    # IndexError here (like the original) means a folder has fewer than 6 CSVs.
    for slot in range(6):
        csv_lists[slot].append(folder_path + sorted_files[slot])

csv1, csv2, csv3, csv4, csv5, csv6 = csv_lists

 

๐Ÿธ2. ๊ฐ™์€ csv ๋ผ๋ฆฌ ํ•˜๋‚˜์˜ dataframe์œผ๋กœ concat ํ•˜๊ธฐ (axis = 0)

# Read every CSV listed in csv1 and stack them row-wise into one DataFrame.
allData = [pd.read_csv(csv_path) for csv_path in csv1]
dataCombine = pd.concat(allData, axis=0, ignore_index=True)

 

 

๐Ÿฆž ์‹œ๊ฐํ™”

# Plot each count column (columns 2..14) against insert_date_time, one
# figure per column.  The original created a single figure/axes up front,
# but plt.show() inside the loop displays and releases it after the first
# iteration and pyplot then spawns fresh figures implicitly — create one
# figure per iteration explicitly instead.
for i in range(2, 15):
    plt.figure()
    plt.title(dataCombine.columns[i])
    plt.plot(dataCombine["insert_date_time"], dataCombine.iloc[:, i], color='r')
    plt.show()

 

๋ฐ์ดํ„ฐ๊ฐ€ ๋„ˆ๋ฌด ๋งŽ์€ ๊ฒƒ ๊ฐ™์•„์„œ,

์‹œ๊ฐ„ ๋ณ„๋กœ SUM ํ•ด์„œ ๋‹ค์‹œ ๋ฐ์ดํ„ฐ ํ•ฉ์ณ๋ณด๊ธฐ

 

๐Ÿฆž ๋ฐ์ดํ„ฐ ์ „์ฒ˜๋ฆฌ

๐Ÿธ 1. ๊ฐ™์€ insert_data_time ๋ณ„๋กœ ๋ฌถ์–ด์„œ sum ํ•˜๊ธฐ -> ๋ฐ์ดํ„ฐ ์ค„์ž„

# Collapse rows sharing the same timestamp into one summed row per
# insert_date_time (a single key needs no list; as_index=True keeps the
# timestamp as the index, which the later plots rely on).
dataCombine2 = dataCombine.groupby("insert_date_time", as_index=True).sum()
dataCombine2

 

 

๐Ÿธ 2. ๋ฐ์ดํ„ฐ ์Šค์ผ€์ผ๋ง ํ•˜๊ธฐ ์œ„ํ•ด ์ค€๋น„ํ•˜๊ธฐ

dataCombine2_temp = dataCombine2.reset_index(drop=True)

 

๐Ÿฆž ์‹œ๊ฐํ™”

# Plot each summed column (columns 1..13) against the timestamp index, one
# figure per column.  As in the first visualization, the single up-front
# figure/axes was only used by the first iteration (plt.show() inside the
# loop releases it) — create one figure per iteration explicitly instead.
for i in range(1, 14):
    plt.figure()
    plt.title(dataCombine2_temp.columns[i])
    plt.plot(dataCombine2.index, dataCombine2_temp.iloc[:, i], color='r')
    plt.show()

 

 

๐Ÿฆž ๋ชจ๋ธ ๋งŒ๋“ค๊ธฐ

๐Ÿธ 1. 0 ๋ฐ์ดํ„ฐ ์ง€์šฐ๊ณ , ์ฃผ๊ธฐ์„ฑ ๊ฐ€์ง€๋Š” ๊ทธ๋ž˜ํ”„๋“ค๋งŒ ๋ชจ์•„์„œ LSTM ๋ชจ๋ธ ํ›ˆ๋ จํ•˜๊ธฐ 

# Keep only the columns with a usable periodic signal for the LSTM.
# NOTE(review): the four labels below look like placeholders from the
# write-up (one appears twice) — substitute the real column names of your
# data; drop raises KeyError if a label is not an actual column.
dataCombine3_temp = dataCombine2_temp.drop(["ํ•„์š”์—†๋Š”๊ฑฐ", '0 ๋ฐ์ดํ„ฐ ๋ฟ', '0 ๋ฐ์ดํ„ฐ ๋ฟ', '๋ถˆ๊ทœ์น™'], axis = 1)
dataCombine3_temp

 

 

 

 

๐Ÿธ 2. ์ •๊ทœํ™”

!pip install -U sklearn

 

from sklearn.preprocessing import MinMaxScaler

# Rescale every column to [0, 1] so no single counter dominates training.
scaler = MinMaxScaler()

scale_cols = dataCombine3_temp.columns
dataCombine3_temp_scaled = pd.DataFrame(
    scaler.fit_transform(dataCombine3_temp),
    columns=scale_cols,
)

dataCombine3_temp_scaled

 

 

๐Ÿธ 3. ์‹œ๊ณ„์—ด ๋ฐ์ดํ„ฐ์˜ ๋ฐ์ดํ„ฐ์…‹ ๋ถ„๋ฆฌ

  • ์‹œ๊ณ„์—ด ๋ฐ์ดํ„ฐ์˜ ๋ฐ์ดํ„ฐ์…‹์€ ๋ณดํ†ต window_size
    • window_size๋Š” ๊ณผ๊ฑฐ ๊ธฐ๊ฐ„์˜ ์ฃผ๊ฐ€ ๋ฐ์ดํ„ฐ์— ๊ธฐ๋ฐ˜ํ•˜์—ฌ ๋‹ค์Œ๋‚ ์˜ ์ข…๊ฐ€๋ฅผ ์˜ˆ์ธกํ•  ๊ฒƒ์ธ๊ฐ€๋ฅผ ์ •ํ•˜๋Š” parameter
  • ๋งŒ์•ฝ ๊ณผ๊ฑฐ 20์ผ์„ ๊ธฐ๋ฐ˜์œผ๋กœ ๋‚ด์ผ ๋ฐ์ดํ„ฐ๋ฅผ ์˜ˆ์ธกํ•œ๋‹ค๋ผ๊ณ  ๊ฐ€์ •ํ•˜๋ฉด window_size=20์ด ๋˜๋Š” ๊ฒƒ
# Hold the last 200 rows out for testing; each sample looks 20 steps back.
TEST_SIZE = 200
WINDOW_SIZE = 20

# Positional row slicing via iloc (identical to plain [] slicing here).
train = dataCombine3_temp_scaled.iloc[:-TEST_SIZE]
test = dataCombine3_temp_scaled.iloc[-TEST_SIZE:]
def make_dataset(data, label, window_size=20):
    """Turn a time series into supervised samples.

    Slides a window of ``window_size`` consecutive rows over ``data`` and
    pairs each window with the ``label`` row immediately after it.

    Returns a tuple of numpy arrays:
    features with shape (n, window_size, n_cols) and labels with shape
    (n, label_cols), where n = len(data) - window_size.
    """
    features, labels = [], []
    for start in range(len(data) - window_size):
        stop = start + window_size
        features.append(data.iloc[start:stop].to_numpy())
        labels.append(label.iloc[stop].to_numpy())
    return np.array(features), np.array(labels)

 

 

๐Ÿธ 4. ํ›ˆ๋ จ, ํ…Œ์ŠคํŠธ ๋ฐ์ดํ„ฐ ๋‚˜๋ˆ„๊ธฐ

from sklearn.model_selection import train_test_split

# Model inputs: the queue/counter columns kept after the drop above.
feature_cols = ['routegroupque', 'cnt_fastque', 'cnt_slowque', 'cnt_dbque',
       'cnt_waitackmap', 'cnt_success', 'cnt_invalid', 'cnt_timeout',
       'cnt_resent', 'cnt_onemin']
# Target: cnt_timeout one step after each window.
label_cols = ['cnt_timeout']

train_feature = train[feature_cols]
train_label = train[label_cols]

# Window the training series -> (samples, WINDOW_SIZE, n_features) / (samples, 1).
# Use the WINDOW_SIZE constant instead of a second hard-coded 20.
train_feature, train_label = make_dataset(train_feature, train_label, WINDOW_SIZE)

# random_state pins the split so results are reproducible (it was unseeded).
x_train, x_valid, y_train, y_valid = train_test_split(
    train_feature, train_label, test_size=0.2, random_state=42)
x_train.shape, x_valid.shape

test_feature = test[feature_cols]
test_label = test[label_cols]

test_feature.shape, test_label.shape

test_feature, test_label = make_dataset(test_feature, test_label, WINDOW_SIZE)
test_feature.shape, test_label.shape

 

๐Ÿธ 5. LSTM ๋ชจ๋ธ

from keras.models import Sequential
from keras.layers import Dense, LSTM
from keras.callbacks import EarlyStopping, ModelCheckpoint
import os

# Single-layer LSTM regressor: a (WINDOW_SIZE x n_features) window in,
# one scalar out (the next-step scaled cnt_timeout).
model = Sequential()
model.add(LSTM(16,
               input_shape=(train_feature.shape[1], train_feature.shape[2]),
               activation='relu',
               return_sequences=False))
model.add(Dense(1))

model.compile(loss='mean_squared_error', optimizer='adam')
# Stop training once validation loss has not improved for 5 epochs.
early_stop = EarlyStopping(monitor='val_loss', patience=5)

# Checkpoint the best weights.  The directory must exist up front:
# ModelCheckpoint does not create it and would fail on the first save.
model_path = 'model'
os.makedirs(model_path, exist_ok=True)
filename = os.path.join(model_path, 'tmp_checkpoint.h5')
checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1,
                             save_best_only=True, mode='auto')

history = model.fit(x_train, y_train,
                    epochs=200,
                    batch_size=16,
                    validation_data=(x_valid, y_valid),
                    callbacks=[early_stop, checkpoint])

 

# Restore the best checkpointed weights and predict on the held-out windows.
model.load_weights(filename)
pred = model.predict(test_feature)

pred.shape

# Overlay actual vs. predicted on one explicit axes.
fig, ax = plt.subplots(figsize=(12, 9))
ax.plot(test_label, label='actual')
ax.plot(pred, label='prediction')
ax.legend()
plt.show()

 

 

 

 

728x90
๋ฐ˜์‘ํ˜•
Comments