๐ ๊ณต๋ถํ๋ ์ง์ง์ํ์นด๋ ์ฒ์์ด์ง?
[Kaggle] Chest X-Ray ํ์ ์ด๋ฏธ์ง ๋ถ๋ฅํ๊ธฐ ๋ณธ๋ฌธ
๐ฉ๐ป ์ปดํจํฐ ๊ตฌ์กฐ/Kaggle
[Kaggle] Chest X-Ray ํ์ ์ด๋ฏธ์ง ๋ถ๋ฅํ๊ธฐ
์ง์ง์ํ์นด 2022. 1. 29. 18:56728x90
๋ฐ์ํ
220129 ์์ฑ
<๋ณธ ๋ธ๋ก๊ทธ๋ Kaggle ์ ์ฐธ๊ณ ํด์ ๊ณต๋ถํ๋ฉฐ ์์ฑํ์์ต๋๋ค>
1. 라이브러리 import 및 데이터 load
import os, re
import random, math
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
# Speed up data loading: this constant is passed as num_parallel_calls / buffer_size below
AUTOTUNE = tf.data.experimental.AUTOTUNE
# X-ray image size; used as the target size for tf.image.resize in decode_img
IMAGE_SIZE = [180, 180]
- 데이터 경로
# Dataset root: the current working directory, kept with a trailing separator
# so existing code that concatenates onto root_path keeps working.
root_path = os.path.join(os.getcwd(), "")
# Glob patterns of the form chest_xray/<split>/<class>/<image>. They are built
# with os.path.join instead of hard-coded backslashes so they also match on
# non-Windows systems and agree with get_label, which splits on os.path.sep.
train_path = os.path.join(root_path, "chest_xray", "train", "*", "*")
val_path = os.path.join(root_path, "chest_xray", "val", "*", "*")
test_path = os.path.join(root_path, "chest_xray", "test", "*", "*")
BATCH_SIZE = 30  # number of images per batch
EPOCHS = 50
2. 데이터 준비
- train_data : 5216
- test_data : 624
- val_data : 16
# Collect the file paths matched by each split's glob pattern, then print
# how many files each split contains (train, test, val — in that order).
train_data, test_data, val_data = (
    tf.io.gfile.glob(pattern) for pattern in (train_path, test_path, val_path)
)
for split_files in (train_data, test_data, val_data):
    print(len(split_files))
-
val 너무 작으니 train 과 val 합치고 다시 8 : 2
# Pool the train and val file paths into a single list so they can be re-split.
train_val_sum = tf.io.gfile.glob(train_path) + tf.io.gfile.glob(val_path)
# 8:2 split point
train_size = math.floor(0.8 * len(train_val_sum))
# Shuffle the path list in place so the split below is random
random.shuffle(train_val_sum)
train, val = train_val_sum[:train_size], train_val_sum[train_size:]
for subset in (train, val):
    print(len(subset))
- train : 4185
- val : 1047
- ํ๋ ด ์๋ normal : 1077
- ํ๋ ด : 3108
# Count training files per class by looking for the class name in each path.
normal = sum(1 for path in train if "NORMAL" in path)
print(f"normal image count in train set : {normal}")
pneumonia = sum(1 for path in train if "PNEUMONIA" in path)
print(f"pneumonia image count in train set : {pneumonia}")
-
tf.data.Dataset.from_tensor_slices 함수는 tf.data.Dataset 를 생성하는 함수
# Wrap the train/val path lists in tf.data datasets (one element per path).
train_list_ds, val_list_ds = (
    tf.data.Dataset.from_tensor_slices(paths) for paths in (train, val)
)
# Element counts; used later to derive steps per epoch.
TRAIN_IMG_COUNT = train_list_ds.cardinality().numpy()
print(f"Training images count: {TRAIN_IMG_COUNT}")
VAL_IMG_COUNT = val_list_ds.cardinality().numpy()
print(f"Validating images count: {VAL_IMG_COUNT}")
-
'NORMAL'이나 'PNEUMONIA'가 포함되어 있기 때문에 이를 이용해서 라벨 데이터
def get_label(file_path):
    """Return the boolean label for an image path.

    The second-to-last component of the path (split on os.path.sep) is the
    class folder: the result is True (positive) when it equals "PNEUMONIA"
    and False (negative) otherwise, e.g. for NORMAL.
    """
    path_components = tf.strings.split(file_path, os.path.sep)
    class_name = path_components[-2]
    return class_name == "PNEUMONIA"
- ์ด๋ฏธ์ง ์ฌ์ด์ฆ ์ค์ด๊ธฐ
def decode_img(img):
    """Decode JPEG bytes into a float32, 3-channel image resized to IMAGE_SIZE."""
    # Decode to a uint8 tensor with three channels
    pixels = tf.image.decode_jpeg(img, channels=3)
    # Convert to float32
    pixels = tf.image.convert_image_dtype(pixels, tf.float32)
    # Resize to IMAGE_SIZE ([180, 180])
    return tf.image.resize(pixels, IMAGE_SIZE)
- ์ด๋ฏธ์ง, ๋ผ๋ฒจ ์ฝ๊ธฐ
def process_path(file_path):
    """Load one example from disk and return it as an (image, label) pair."""
    raw_bytes = tf.io.read_file(file_path)  # read the encoded image file
    # Decode/resize the image and derive the label from the path
    return decode_img(raw_bytes), get_label(file_path)
- train, validation ๋ฐ์ดํฐ ์ ๋ง๋ค๊ธฐ
# Process the data quickly: map each file path to an (image, label) pair via
# process_path, passing AUTOTUNE as the degree of parallelism
train_ds = train_list_ds.map(process_path, num_parallel_calls=AUTOTUNE)
val_ds = val_list_ds.map(process_path, num_parallel_calls=AUTOTUNE)
- ์ด๋ฏธ์ง ํ์ธ
# Sanity check: print the shape and label of a single training example
for image, label in train_ds.take(1):
    image_shape = image.numpy().shape
    print("Image shape: ", image_shape)
    label_value = label.numpy()
    print("Label: ", label_value)
- test ๋ ๋์ผํ๊ฒ
# Build the test dataset the same way as train/val, but only batch it
# (no augmentation, shuffle buffer or repeat).
test_list_ds = tf.data.Dataset.list_files(test_path)
TEST_IMAGE_COUNT = test_list_ds.cardinality().numpy()
test_ds = test_list_ds.map(
    process_path, num_parallel_calls=AUTOTUNE
).batch(BATCH_SIZE)
# Inspect the first test batch
for image, label in test_ds.take(1):
    print("Image shape: ", image.numpy().shape)
    print("Label: ", label.numpy())
print(TEST_IMAGE_COUNT)
- data ๋ฅผ ์ข๋ ํจ์จ์ ์ผ๋ก
def augment(image, label):
    """Randomly mirror the image left-to-right; the label is returned unchanged."""
    flipped = tf.image.random_flip_left_right(image)
    return flipped, label
def prepare_for_training(ds, shuffle_buffer_size=1000):
    """Turn a dataset of (image, label) pairs into a training input pipeline.

    Args:
        ds: dataset whose elements are individual (image, label) pairs.
        shuffle_buffer_size: size of the buffer used by ds.shuffle.

    Returns:
        The dataset after augment -> shuffle -> repeat -> batch(BATCH_SIZE)
        -> prefetch. Because of repeat() it never ends, so the caller has to
        bound it (e.g. with steps_per_epoch / validation_steps).
    """
    # Apply the augment function. AUTOTUNE replaces the hard-coded 2 so this
    # map() is tuned the same way as every other map() call in the file.
    ds = ds.map(augment, num_parallel_calls=AUTOTUNE)
    # shuffle: keep a fixed-size buffer and pick the next element from it
    # uniformly at random
    ds = ds.shuffle(buffer_size=shuffle_buffer_size)
    # repeat: iterate over the data repeatedly
    ds = ds.repeat()
    # batch: group elements into batches of BATCH_SIZE
    ds = ds.batch(BATCH_SIZE)
    # prefetch: prepare upcoming batches while the current one is consumed
    ds = ds.prefetch(buffer_size=AUTOTUNE)
    return ds
# Run both splits through prepare_for_training (augment, shuffle, repeat,
# batch, prefetch).
# NOTE(review): this also augments and shuffles the validation set — confirm
# that is intended.
train_ds = prepare_for_training(train_ds)
val_ds = prepare_for_training(val_ds)
3. 데이터 시각화
def show_batch(image_batch, label_batch):
    """Show the images of one batch in a 5-row grid, titled with their class.

    Args:
        image_batch: sized, indexable batch of images that plt.imshow accepts.
        label_batch: indexable batch of labels; a truthy label is titled
            "PNEUMONIA", a falsy one "NORMAL".
    """
    plt.figure(figsize=(10, 10))
    columns = math.ceil(BATCH_SIZE / 5)
    # Draw at most BATCH_SIZE images (the grid size), but never index past the
    # end of a batch that is shorter than BATCH_SIZE.
    shown = min(len(image_batch), BATCH_SIZE)
    for n in range(shown):
        plt.subplot(5, columns, n + 1)
        plt.imshow(image_batch[n])
        plt.title("PNEUMONIA" if label_batch[n] else "NORMAL")
        plt.axis("off")
# Pull the first batch from the training dataset and display it as NumPy arrays
image_batch, label_batch = next(iter(train_ds))
show_batch(image_batch.numpy(), label_batch.numpy())
4. CNN 모델링
: CNN์ Convolution Layer์ Max Pooling ๋ ์ด์ด๋ฅผ ๋ฐ๋ณต์ ์ผ๋ก stack์ ์๋ ํน์ง ์ถ์ถ(Feature Extraction)
: Fully Connected Layer๋ฅผ ๊ตฌ์ฑํ๊ณ ๋ง์ง๋ง ์ถ๋ ฅ์ธต์ Softmax๋ฅผ ์ ์ฉํ ๋ถ๋ฅ ๋ถ๋ถ์ผ๋ก ๋๋๋ค
- Convolution Block
1) conv 2 ๋ฒ
2) batch normal 로 과적합 방지
3) max pooling
def conv_block(filters):
    """Build one convolution stage as a Sequential sub-model.

    The stage is two SeparableConv2D layers (kernel size 3, ReLU, 'same'
    padding, `filters` filters each), then BatchNormalization and MaxPool2D.
    """
    separable_convs = [
        tf.keras.layers.SeparableConv2D(filters, 3, activation='relu', padding='same')
        for _ in range(2)
    ]
    tail = [tf.keras.layers.BatchNormalization(), tf.keras.layers.MaxPool2D()]
    return tf.keras.Sequential(separable_convs + tail)
- dense block
def dense_block(units, dropout_rate):
    """Build a fully connected stage: Dense(ReLU) -> BatchNormalization -> Dropout."""
    fully_connected = tf.keras.layers.Dense(units, activation='relu')
    normalization = tf.keras.layers.BatchNormalization()
    dropout = tf.keras.layers.Dropout(dropout_rate)
    return tf.keras.Sequential([fully_connected, normalization, dropout])
- ์ ์ฒด ๋ชจ๋ธ
def build_model():
    """Assemble the full CNN and return it (not compiled).

    Layout: an input stem of two plain 3x3 convolutions and a pooling layer,
    four conv_block stages (32/64/128/256 filters) with dropout after the last
    two, then a flattened classifier of three dense_block stages ending in a
    single sigmoid unit.
    """
    layers = tf.keras.layers

    stem = [
        layers.InputLayer(shape=(*IMAGE_SIZE[:2], 3)),
        layers.Conv2D(16, 3, activation='relu', padding='same'),
        layers.Conv2D(16, 3, activation='relu', padding='same'),
        layers.MaxPool2D(),
    ]
    feature_stages = [
        conv_block(32),
        conv_block(64),
        conv_block(128),
        layers.Dropout(0.2),
        conv_block(256),
        layers.Dropout(0.2),
    ]
    classifier = [
        layers.Flatten(),
        dense_block(512, 0.7),
        dense_block(128, 0.5),
        dense_block(64, 0.3),
        layers.Dense(1, activation='sigmoid'),
    ]
    return tf.keras.Sequential(stem + feature_stages + classifier)
5. 데이터 불균형 처리
- ๋๋ค ํฌ๋ ์คํธ๊ฐ ์ข๋ค
# Weight each class by (1 / class count) * (total training images) / 2, so the
# class with fewer images gets the larger weight. Key 0 is NORMAL and key 1 is
# PNEUMONIA.
class_weight = {
    class_id: (1 / class_count)*(TRAIN_IMG_COUNT)/2.0
    for class_id, class_count in ((0, normal), (1, pneumonia))
}
weight_for_0, weight_for_1 = class_weight[0], class_weight[1]
print('Weight for NORMAL: {:.2f}'.format(weight_for_0))
print('Weight for PNEUMONIA: {:.2f}'.format(weight_for_1))
6. 모델 훈련
# Build and compile the model under the '/GPU:0' device scope.
with tf.device('/GPU:0'):
    model = build_model()

    # Metrics tracked during fit/evaluate, in this order, in addition to the loss
    METRICS = ['accuracy']
    METRICS.append(tf.keras.metrics.Precision(name='precision'))
    METRICS.append(tf.keras.metrics.Recall(name='recall'))

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=METRICS)
- EarlyStopping
# Take the callback from tf.keras so it comes from the same Keras
# implementation as the tf.keras model it is attached to.
from tensorflow.keras.callbacks import EarlyStopping

# Stop training when val_loss has not improved for 10 consecutive epochs.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=10)
history = model.fit(
    train_ds,
    # The datasets repeat forever, so each epoch is bounded by a step count.
    steps_per_epoch=TRAIN_IMG_COUNT // BATCH_SIZE,
    # Train for up to EPOCHS epochs. With the previous hard-coded epochs=10,
    # the patience-10 early stopping above could never trigger and the EPOCHS
    # constant was unused.
    epochs=EPOCHS,
    callbacks=[es],
    validation_data=val_ds,
    validation_steps=VAL_IMG_COUNT // BATCH_SIZE,
    # Counteract the NORMAL/PNEUMONIA imbalance in the training set.
    class_weight=class_weight,
)
7. 시각화
# Plot the training and validation curve of each tracked metric side by side.
fig, ax = plt.subplots(1, 4, figsize=(20, 5))
ax = ax.ravel()
for axis, met in zip(ax, ['precision', 'recall', 'accuracy', 'loss']):
    for history_key in (met, 'val_' + met):
        axis.plot(history.history[history_key])
    axis.set_title('Model {}'.format(met))
    axis.set_xlabel('epochs')
    axis.set_ylabel(met)
    axis.legend(['train', 'val'])
- test ๋ฐ์ดํฐ๋ก ๋ชจ๋ธ ํ๊ฐ
# IPython display helpers for embedding rich output
from IPython.display import Image
from IPython.core.display import HTML
# Embed a remote illustration by URL — presumably a precision/recall diagram,
# given the surrounding text; verify the link still resolves.
Image(url= "https://miro.medium.com/proxy/1*pOtBHai4jFd-ujaNXPilRg.png")
Precision ( ์ ๋ฐ๋ ) ex) ํ์ง์๋ก ๋ถ๋ฅ๋ ์ฌ๋๋ค ์ค ์ค์ ์์ฑ ์๋ฏธ๋ ํ๋ฅ
Recall ( ์ฌํ์จ ) ex) ์ค์ ๋ก ์์ฑ์ธ ์๋ฏผ์ ํ์ง์๋ก ๋ถ๋ฅํ ํ๋ฅ
# Evaluate on the test batches. The four values are unpacked as the loss
# followed by the metrics in the order they were given to model.compile.
test_results = model.evaluate(test_ds)
loss, accuracy, precision, recall = test_results
print(f'Loss: {loss},\nAccuracy: {accuracy},\nPrecision: {precision},\nRecall: {recall}')
- Precision 과 Recall 은 trade-off 관계
728x90
๋ฐ์ํ
'๐ฉโ๐ป ์ปดํจํฐ ๊ตฌ์กฐ > Kaggle' ์นดํ ๊ณ ๋ฆฌ์ ๋ค๋ฅธ ๊ธ
[Kaggle]Super Image Resolution_๊ณ ํ์ง ์ด๋ฏธ์ง ๋ง๋ค๊ธฐ (0) | 2022.02.07 |
---|---|
[Kaggle] CNN Architectures (0) | 2022.02.04 |
[Kaggle] HeartAttack ์์ธก (0) | 2022.01.31 |
[Kaggle]Breast Cancer Wisconsin (Diagnostic) Data Set_์ ๋ฐฉ์ ๋ถ๋ฅ (0) | 2022.01.28 |
[Kaggle]MBTI_Myers-Briggs Personality Type Dataset(์ฑ๊ฒฉ์ฐ๊ตฌ) (0) | 2022.01.22 |
Comments