[Kaggle] Chest X-Ray Pneumonia Image Classification
2022. 1. 29. 18:56
<This post was written while studying with reference to Kaggle>
Classifying Pneumonia Images
This post uses Kaggle's Chest X-Ray Images dataset. Pneumonia is a severe respiratory infection in which the lungs become inflamed. In such X-rays, a faint shadow can appear over the lung area, but the image alone cannot definitively confirm or rule out pneumonia.
1. Importing libraries and loading data
import os, re
import random, math
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
# let TensorFlow tune the data pipeline for fast loading
AUTOTUNE = tf.data.experimental.AUTOTUNE
# X-ray image size
IMAGE_SIZE = [180, 180]
- Data paths
root_path = os.getcwd() + '\\'
train_path = root_path + "chest_xray\\train\\*\\*"
val_path = root_path + "chest_xray\\val\\*\\*"
test_path = root_path + "chest_xray\\test\\*\\*"
BATCH_SIZE = 30 # number of samples per batch
EPOCHS = 50
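The backslash-joined paths above are Windows-specific. A portable alternative (a minimal sketch, not part of the original notebook; tf.io.gfile.glob also accepts forward slashes) builds the same glob patterns with os.path.join:

root_path = os.getcwd()
train_path = os.path.join(root_path, "chest_xray", "train", "*", "*")
val_path = os.path.join(root_path, "chest_xray", "val", "*", "*")
test_path = os.path.join(root_path, "chest_xray", "test", "*", "*")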
2. Preparing the data
- train_data : 5216
- test_data : 624
- val_data : 16
train_data = tf.io.gfile.glob(train_path)
test_data = tf.io.gfile.glob(test_path)
val_data = tf.io.gfile.glob(val_path)
print(len(train_data))
print(len(test_data))
print(len(val_data))
- Since val is far too small (16 images), merge train and val, then re-split 8 : 2
train_val_sum = tf.io.gfile.glob(train_path)
train_val_sum.extend(tf.io.gfile.glob(val_path))
# 8:2
train_size = math.floor(len(train_val_sum)*0.8)
random.shuffle(train_val_sum) # shuffle the merged file list in place before the 8:2 split
train = train_val_sum[:train_size]
val = train_val_sum[train_size:]
print(len(train))
print(len(val))
- train : 4185
- val : 1047
- normal (no pneumonia) in train : 1077
- pneumonia in train : 3108
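As a sanity check on the split: 5216 + 16 = 5232 files in total, and floor(5232 × 0.8) = 4185 for train, leaving 1047 for val, which matches the printed counts.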
normal = len([filename for filename in train if "NORMAL" in filename])
print(f"normal image count in train set : {normal}")
pneumonia = len([filename for filename in train if "PNEUMONIA" in filename])
print(f"pneumonia image count in train set : {pneumonia}")

- tf.data.Dataset.from_tensor_slices creates a tf.data.Dataset from the list of file paths
train_list_ds = tf.data.Dataset.from_tensor_slices(train)
val_list_ds = tf.data.Dataset.from_tensor_slices(val)
TRAIN_IMG_COUNT = tf.data.experimental.cardinality(train_list_ds).numpy()
print(f"Training images count: {TRAIN_IMG_COUNT}")
VAL_IMG_COUNT = tf.data.experimental.cardinality(val_list_ds).numpy()
print(f"Validating images count: {VAL_IMG_COUNT}")
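As a quick illustration (a toy example, not in the original post), from_tensor_slices simply turns a Python list into a dataset whose elements are string tensors:

demo_ds = tf.data.Dataset.from_tensor_slices(["a.jpeg", "b.jpeg"])
for path in demo_ds:
    print(path)  # tf.Tensor(b'a.jpeg', shape=(), dtype=string), then b'b.jpeg'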

- Each file path contains 'NORMAL' or 'PNEUMONIA', so the label can be derived from the path itself
# the second-to-last component of the file path distinguishes positive from negative
def get_label(file_path):
    parts = tf.strings.split(file_path, os.path.sep)
    return parts[-2] == "PNEUMONIA"  # pneumonia is positive (True), normal is negative (False)
- Resizing the images
def decode_img(img):
    img = tf.image.decode_jpeg(img, channels=3)  # decode the image into a uint8 tensor
    img = tf.image.convert_image_dtype(img, tf.float32)  # convert to float32 in [0, 1]
    img = tf.image.resize(img, IMAGE_SIZE)  # resize the image to [180, 180]
    return img
- Reading images and labels
def process_path(file_path):
    label = get_label(file_path)  # extract the label
    img = tf.io.read_file(file_path)  # read the raw image file
    img = decode_img(img)  # convert the image into the right format
    return img, label
- Building the train and validation datasets
# map with num_parallel_calls so files are processed in parallel
train_ds = train_list_ds.map(process_path, num_parallel_calls=AUTOTUNE)
val_ds = val_list_ds.map(process_path, num_parallel_calls=AUTOTUNE)
- Checking an image
for image, label in train_ds.take(1):
    print("Image shape: ", image.numpy().shape)
    print("Label: ", label.numpy())

- Do the same for the test set
test_list_ds = tf.data.Dataset.list_files(test_path)
TEST_IMAGE_COUNT = tf.data.experimental.cardinality(test_list_ds).numpy()
test_ds = test_list_ds.map(process_path, num_parallel_calls=AUTOTUNE)
test_ds = test_ds.batch(BATCH_SIZE)
for image, label in test_ds.take(1):
    print("Image shape: ", image.numpy().shape)
    print("Label: ", label.numpy())
print(TEST_IMAGE_COUNT)
- Making the data pipeline more efficient
# random_flip_left_right : randomly flips images horizontally
def augment(image, label):
    image = tf.image.random_flip_left_right(image)
    return image, label
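Horizontal flipping is the only augmentation used here. Other tf.image ops could be chained the same way; a hedged sketch (not part of the original post) adding a small brightness jitter:

def augment_more(image, label):
    image = tf.image.random_flip_left_right(image)
    image = tf.image.random_brightness(image, max_delta=0.1)  # small random brightness shift
    image = tf.clip_by_value(image, 0.0, 1.0)  # keep pixel values in the valid [0, 1] range
    return image, label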
def prepare_for_training(ds, shuffle_buffer_size=1000):
    ds = ds.map(
        augment,  # apply the augment function
        num_parallel_calls=2
    )
    # shuffle : keeps a fixed-size buffer and picks the next element uniformly at random from it
    ds = ds.shuffle(buffer_size=shuffle_buffer_size)
    # repeat : re-iterate the data indefinitely; fit() bounds each epoch via steps_per_epoch
    ds = ds.repeat()
    # batch : group elements into batches of BATCH_SIZE
    ds = ds.batch(BATCH_SIZE)
    # prefetch : overlap CPU preprocessing with GPU training
    ds = ds.prefetch(buffer_size=AUTOTUNE)
    return ds
train_ds = prepare_for_training(train_ds)
val_ds = prepare_for_training(val_ds)
3. Visualizing the data
# given a batch of images, display them all in a grid
def show_batch(image_batch, label_batch):
    plt.figure(figsize=(10, 10))
    for n in range(BATCH_SIZE):  # BATCH_SIZE = 30
        ax = plt.subplot(5, math.ceil(BATCH_SIZE / 5), n + 1)
        plt.imshow(image_batch[n])
        if label_batch[n]:
            plt.title("PNEUMONIA")
        else:
            plt.title("NORMAL")
        plt.axis("off")
image_batch, label_batch = next(iter(train_ds))
show_batch(image_batch.numpy(), label_batch.numpy())
4. CNN modeling
: A CNN divides into a feature-extraction part, which repeatedly stacks Convolution and Max Pooling layers,
: and a classification part built from Fully Connected Layers, typically ending in a Softmax output layer (for a binary task like this one, a single sigmoid unit is used instead).
- Convolution Block
def conv_block(filters):
    block = tf.keras.Sequential([
        tf.keras.layers.SeparableConv2D(filters, 3, activation='relu', padding='same'),
        tf.keras.layers.SeparableConv2D(filters, 3, activation='relu', padding='same'),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.MaxPool2D()
    ])
    return block
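SeparableConv2D factorizes a standard convolution into a depthwise pass followed by a 1x1 pointwise pass, which sharply cuts parameters. A quick way to see the difference (an illustrative sketch, not in the original post) is to compare parameter counts on a single layer:

inp = tf.keras.Input(shape=(IMAGE_SIZE[0], IMAGE_SIZE[1], 3))
standard = tf.keras.Model(inp, tf.keras.layers.Conv2D(32, 3, padding='same')(inp))
separable = tf.keras.Model(inp, tf.keras.layers.SeparableConv2D(32, 3, padding='same')(inp))
print(standard.count_params())   # 896 = 3*3*3*32 + 32 biases
print(separable.count_params())  # 155 = 3*3*3 (depthwise) + 3*32 (pointwise) + 32 biases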
- Dense Block
def dense_block(units, dropout_rate):
    block = tf.keras.Sequential([
        tf.keras.layers.Dense(units, activation='relu'),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dropout(dropout_rate)
    ])
    return block
- Full model
def build_model():
    model = tf.keras.Sequential([
        tf.keras.Input(shape=(IMAGE_SIZE[0], IMAGE_SIZE[1], 3)),
        tf.keras.layers.Conv2D(16, 3, activation='relu', padding='same'),
        tf.keras.layers.Conv2D(16, 3, activation='relu', padding='same'),
        tf.keras.layers.MaxPool2D(),
        conv_block(32),
        conv_block(64),
        conv_block(128),
        tf.keras.layers.Dropout(0.2),
        conv_block(256),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Flatten(),
        dense_block(512, 0.7),
        dense_block(128, 0.5),
        dense_block(64, 0.3),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])
    return model
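To sanity-check that the architecture wires up end to end before training, instantiating it and printing a summary is enough:

model = build_model()
model.summary()  # prints each layer's output shape and the total parameter count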
5. Handling data imbalance
- Random Forests are said to handle imbalance well; here, class weights are used instead, weighting each class inversely to its frequency
weight_for_0 = (1 / normal)*(TRAIN_IMG_COUNT)/2.0
weight_for_1 = (1 / pneumonia)*(TRAIN_IMG_COUNT)/2.0
class_weight = {0: weight_for_0, 1: weight_for_1}
print('Weight for NORMAL: {:.2f}'.format(weight_for_0))
print('Weight for PNEUMONIA: {:.2f}'.format(weight_for_1))
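Plugging in the training-set counts from above: weight_for_0 = 4185 / (2 × 1077) ≈ 1.94 and weight_for_1 = 4185 / (2 × 3108) ≈ 0.67, so each NORMAL example weighs roughly three times as much as a PNEUMONIA example in the loss, offsetting the roughly 3:1 class imbalance.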
6. Training the model
with tf.device('/GPU:0'):
    model = build_model()

    METRICS = [
        'accuracy',
        tf.keras.metrics.Precision(name='precision'),
        tf.keras.metrics.Recall(name='recall')
    ]

    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=METRICS
    )
- EarlyStopping
from tensorflow.keras.callbacks import EarlyStopping

# add a callback that stops training once validation loss stops improving
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=10)
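One optional tweak (my addition, not in the original post): EarlyStopping also accepts restore_best_weights=True, which rolls the model back to the best epoch's weights instead of keeping the last ones:

es = EarlyStopping(monitor='val_loss', mode='min', verbose=1,
                   patience=10, restore_best_weights=True)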
history = model.fit(
    train_ds,
    steps_per_epoch=TRAIN_IMG_COUNT // BATCH_SIZE,
    epochs=10, callbacks=[es],
    validation_data=val_ds,
    validation_steps=VAL_IMG_COUNT // BATCH_SIZE,
    class_weight=class_weight,
)
7. Visualization
fig, ax = plt.subplots(1, 4, figsize=(20, 5))
ax = ax.ravel()
for i, met in enumerate(['precision', 'recall', 'accuracy', 'loss']):
    ax[i].plot(history.history[met])
    ax[i].plot(history.history['val_' + met])
    ax[i].set_title('Model {}'.format(met))
    ax[i].set_xlabel('epochs')
    ax[i].set_ylabel(met)
    ax[i].legend(['train', 'val'])
- Evaluating the model on the test data
from IPython.display import Image
from IPython.core.display import HTML
Image(url= "https://miro.medium.com/proxy/1*pOtBHai4jFd-ujaNXPilRg.png")
Precision: of the people classified as patients, the proportion who are actually positive.
Recall: of the people who are actually positive, the proportion classified as patients.
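In confusion-matrix terms (standard definitions, not from the original post), precision = TP / (TP + FP) and recall = TP / (TP + FN). A tiny numeric check:

tp, fp, fn = 80, 20, 10     # hypothetical counts
precision = tp / (tp + fp)  # 0.8  : share of flagged patients who are truly sick
recall = tp / (tp + fn)     # ~0.89: share of sick patients who were flagged
print(precision, recall)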
loss, accuracy, precision, recall = model.evaluate(test_ds)
print(f'Loss: {loss},\nAccuracy: {accuracy},\nPrecision: {precision},\nRecall: {recall}')
- Precision and Recall are in a trade-off relationship, as the sketch below illustrates
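The trade-off can be observed directly by sweeping the decision threshold on the sigmoid output; a hedged sketch assuming the trained model above (raising the threshold generally raises precision and lowers recall):

probs = model.predict(test_ds)      # sigmoid outputs in [0, 1]
for threshold in (0.3, 0.5, 0.7):
    preds = (probs > threshold).astype(int)
    print(threshold, preds.mean())  # fraction flagged as PNEUMONIA shrinks as the threshold rises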