[Kaggle]Breast Cancer Wisconsin (Diagnostic) Data Set_유방암 분류
220128 작성
<본 블로그는 Kaggle 을 참고해서 공부하며 작성하였습니다>
https://bigdaheta.tistory.com/33
[머신러닝] 캐글(kaggle)예제 - 위스콘신 유방암 예측 데이터 분석 (Wisconsin Diagnostic breast cancer datase
위스콘신 유방암 데이터 세트는 종양의 크기, 모양 등의 다양한 속성 값을 기반으로 해당 종양이 악성(malignant)인지 양성 (benign)인지를 분류한 데이터 세트이다. 이 데이터 세트를 앙상블(투표,
bigdaheta.tistory.com
유방암 악성 종양인지
양성 종양인지
이진 분류 해보자!
1. 라이브러리 import
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import learning_curve, validation_curve
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
2. 데이터 가져오기
# Load the dataset from the relative path "data.csv".
data = pd.read_csv("data.csv")
# Bare expression: shows the frame when run as the last line of a notebook cell
# (assumes this file is a notebook export -- TODO confirm).
data
# Print the column dtypes and non-null counts.
data.info()
- 일단 float 이나 int 만 보인다... object 는 안보인다
# Count the missing values in each column.
data.isnull().sum()
- Unnamed:32 null 존재!
3. 데이터 전처리
1) diagnosis 숫자로
# List the distinct values present in the diagnosis column.
data["diagnosis"].unique()
- '양성종양(Benign tumor)'과 '악성종양(Malignant tumor)'
- 악성종양은 1, 양성종양은 0
def cancle(a):
    """Encode a diagnosis label as a binary class.

    Args:
        a: Diagnosis label string (any container supporting ``'M' in a`` works).

    Returns:
        1 when the label contains ``'M'``, otherwise 0.
    """
    # Membership test (not equality) is kept so labels such as " M" still map to 1.
    return 1 if 'M' in a else 0
# Add a numeric label column by applying cancle to every diagnosis value
# (1 when the value contains 'M', otherwise 0).
data["ํ์ "] = data["diagnosis"].apply(cancle)
# Preview the first ten rows, including the new column.
data.head(10)
2) 필요없는거 drop
# Show the column index to pick which columns to drop.
data.columns
- id 필요없고, diagnosis 는 숫자로 했고, null 있는 Unnamed: 32 없애버려~!
# Remove the identifier, the original text label and the "Unnamed: 32" column;
# what remains is the modelling table.
unused_columns = ["id", "diagnosis", "Unnamed: 32"]
total = data.drop(columns=unused_columns)
total.head()
4. train/test 분할
# The numeric label column is the target; every other column is a feature.
y = total["ํ์ "]
X = total.drop(columns=["ํ์ "])
# Hold out 20% of the rows for testing, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
- test_size์๋ test set์ ๋น์จ์ ์ ๋ ฅ
- stratify์๋ ์ธต ๊ตฌ๋ถ ๋ณ์์ด๋ฆ์ ์ ๋ ฅ -> ๊ฐ ์ธต๋ณ๋ก ๋๋์ด์ test_size๋น์จ ์ ์ฉ ์ถ์ถ
- shuffle=True ๋ฅผ ์ง์ ํด์ฃผ๋ฉด ๋ฌด์์ ์ถ์ถ(random sampling)
- ์ฒด๊ณ์ ์ถ์ถ(systematic sampling)์ ํ๊ณ ์ถ๋ค๋ฉด shuffle=False๋ฅผ ์ง์
- random_state ๋ ์ฌํ๊ฐ๋ฅ์ฑ์ ์ํด์ ๋์ ์ด๊ธฐ๊ฐ์ผ๋ก ์๋ฌด ์ซ์๋ ์ง์
5. 모델 구축
1) voting : 뭐가 제일 낫나~
- 다수의 분류기의 레이블 값 결정 확률을 모두 더하고 평균낸 확률값이 가장 높은 레이블을 최종 보팅 결과값으로 선정 => 소프트 보팅
- 다수의 분류기가 예측한 예측값을 최종 보팅 결과값으로 선정 => 하드 보팅
# Three base learners that feed the voting ensemble.
logistic = LogisticRegression(
    penalty="l2",
    C=0.001,
    solver="liblinear",
    random_state=1,
)
tree = DecisionTreeClassifier(criterion="entropy", max_depth=None, random_state=1)
knn = KNeighborsClassifier(n_neighbors=1, metric="minkowski", p=2)

# Soft voting: the ensemble combines the base learners' predicted class probabilities.
voting_estimators = [("logistic", logistic), ("tree", tree), ("knn", knn)]
voting = VotingClassifier(estimators=voting_estimators, voting="soft")

# Parallel lists: each classifier and the label used when reporting it.
all_clf1 = [logistic, tree, knn, voting]
clf_labels1 = ["Logistic regression", "Decision Tree", "KNN", "Majority voting"]
2) 배깅 : 여러 가지 분류 모델 중 한가지 모델에만 집중 모델 구축
# Baseline: a single unrestricted-depth entropy tree.
tree = DecisionTreeClassifier(criterion="entropy", max_depth=None, random_state=1)
# Bagged ensemble of 500 gini trees.
forest = RandomForestClassifier(
    n_estimators=500,  # number of trees in the forest
    criterion="gini",
    random_state=1,
)

# Parallel lists: each classifier and the label used when reporting it.
all_clf2 = [tree, forest]
clf_labels2 = ["Decision Tree", "Random Forest"]
3) 부스팅
# Depth-1 entropy tree: used as the baseline and as AdaBoost's weak learner.
tree = DecisionTreeClassifier(criterion="entropy", max_depth=1, random_state=1)
# The weak learner is passed positionally because its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 (the old name was removed
# in 1.4); the positional form works on both old and new releases.
adaboost = AdaBoostClassifier(
    tree,
    n_estimators=500,
    learning_rate=0.1,
    random_state=1,
)

# Parallel lists: each classifier and the label used when reporting it.
clf_labels3 = ["Decision Tree", "Ada Boost"]
all_clf3 = [tree, adaboost]
4) AUC
# 10-fold cross-validated ROC AUC on the training split for each voting-section model.
for name, model in zip(clf_labels1, all_clf1):
    scores = cross_val_score(
        estimator=model,
        X=X_train,
        y=y_train,
        scoring="roc_auc",
        cv=10,
    )
    summary = (scores.mean(), scores.std(), name)
    print("ROC AUC : %.3f ( +/- %.3f) [%s]" % summary)
- 오 Voting 이 높다
# 10-fold cross-validated ROC AUC on the training split: single tree vs. random forest.
for name, model in zip(clf_labels2, all_clf2):
    scores = cross_val_score(
        estimator=model,
        X=X_train,
        y=y_train,
        scoring="roc_auc",
        cv=10,
    )
    summary = (scores.mean(), scores.std(), name)
    print("ROC AUC : %.3f ( +/- %.3f) [%s]" % summary)
- 랜덤 포레스트가 좋다
# 10-fold cross-validated ROC AUC on the training split: decision stump vs. AdaBoost.
for name, model in zip(clf_labels3, all_clf3):
    scores = cross_val_score(
        estimator=model,
        X=X_train,
        y=y_train,
        scoring="roc_auc",
        cv=10,
    )
    summary = (scores.mean(), scores.std(), name)
    print("ROC AUC : %.3f ( +/- %.3f) [%s]" % summary)
- AdaBoost 가 좋다
6. ROC 곡선 그리기
# Fit every voting-section model on the training split and overlay its test-set ROC curve.
colors = ["orange", "pink", "blue", "green"]
linestyles = [":", "--", "-.", "-"]
for model, name, line_color, line_style in zip(all_clf1, clf_labels1, colors, linestyles):
    model.fit(X_train, y_train)
    # Second predict_proba column: probability of the class encoded as 1.
    positive_scores = model.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_true=y_test, y_score=positive_scores)
    curve_label = "%s (auc = %.3f) " % (name, auc(x=fpr, y=tpr))
    plt.plot(fpr, tpr, color=line_color, linestyle=line_style, label=curve_label)
plt.legend(loc="lower right")
# Grey diagonal as a reference line from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1], linestyle="--", color="gray", linewidth=2)
plt.xlim([-0.1, 1.1])
plt.ylim([-0.1, 1.1])
plt.grid(alpha=0.5)
plt.xlabel("False positive rate (FPR)")
plt.ylabel("True positive rate (TPR)")
plt.title("Voting")
plt.show()
# Fit the single tree and the random forest, then overlay their test-set ROC curves.
colors = ["orange", "pink", "blue", "green"]
linestyles = [":", "--", "-.", "-"]
# zip stops at the shortest input, so only the first two colours/line styles are used.
for model, name, line_color, line_style in zip(all_clf2, clf_labels2, colors, linestyles):
    model.fit(X_train, y_train)
    # Second predict_proba column: probability of the class encoded as 1.
    positive_scores = model.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_true=y_test, y_score=positive_scores)
    curve_label = "%s (auc = %.3f) " % (name, auc(x=fpr, y=tpr))
    plt.plot(fpr, tpr, color=line_color, linestyle=line_style, label=curve_label)
plt.legend(loc="lower right")
# Grey diagonal as a reference line from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1], linestyle="--", color="gray", linewidth=2)
plt.xlim([-0.1, 1.1])
plt.ylim([-0.1, 1.1])
plt.grid(alpha=0.5)
plt.xlabel("False positive rate (FPR)")
plt.ylabel("True positive rate (TPR)")
plt.title("RandomForest")
plt.show()
# Fit the decision stump and AdaBoost, then overlay their test-set ROC curves.
colors = ["orange", "pink", "blue", "green"]
linestyles = [":", "--", "-.", "-"]
# zip stops at the shortest input, so only the first two colours/line styles are used.
for model, name, line_color, line_style in zip(all_clf3, clf_labels3, colors, linestyles):
    model.fit(X_train, y_train)
    # Second predict_proba column: probability of the class encoded as 1.
    positive_scores = model.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_true=y_test, y_score=positive_scores)
    curve_label = "%s (auc = %.3f) " % (name, auc(x=fpr, y=tpr))
    plt.plot(fpr, tpr, color=line_color, linestyle=line_style, label=curve_label)
plt.legend(loc="lower right")
# Grey diagonal as a reference line from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1], linestyle="--", color="gray", linewidth=2)
plt.xlim([-0.1, 1.1])
plt.ylim([-0.1, 1.1])
plt.grid(alpha=0.5)
plt.xlabel("False positive rate (FPR)")
plt.ylabel("True positive rate (TPR)")
plt.title("AdaBoost")
plt.show()
7. 정오 분류표
# Fit the random forest on the training split and predict labels for the test split.
forest.fit(X_train, y_train)
y_pred = forest.predict(X_test)
print("RandomForest")
# Number of test samples whose predicted label differs from the true label.
print("์๋ชป ๋ถ๋ฅ๋ ์ํ ๊ฐ์ : %d" %(y_test != y_pred).sum())
# Accuracy, precision, recall and F1 on the test split, in that order.
print("์ ํ๋ : %.3f" % accuracy_score(y_test, y_pred))
print("์ ๋ฐ๋ : %.3f" % precision_score(y_true = y_test, y_pred = y_pred))
print("์ฌํ์จ : %.3f" % recall_score(y_true=y_test, y_pred=y_pred))
print("F1 : %.3f" % f1_score(y_true=y_test, y_pred=y_pred))
# Fit AdaBoost on the training split and report its own test-set metrics.
adaboost.fit(X_train, y_train)
# Predict with adaboost itself: predicting with forest here would only repeat
# the RandomForest numbers under the "AdaBoost" heading.
y_pred = adaboost.predict(X_test)
print("AdaBoost")
# Number of test samples whose predicted label differs from the true label.
print("์๋ชป ๋ถ๋ฅ๋ ์ํ ๊ฐ์ : %d" %(y_test != y_pred).sum())
# Accuracy, precision, recall and F1 on the test split, in that order.
print("์ ํ๋ : %.3f" % accuracy_score(y_test, y_pred))
print("์ ๋ฐ๋ : %.3f" % precision_score(y_true = y_test, y_pred = y_pred))
print("์ฌํ์จ : %.3f" % recall_score(y_true=y_test, y_pred=y_pred))
print("F1 : %.3f" % f1_score(y_true=y_test, y_pred=y_pred))
8. 최적화
# Inspect the parameter names exposed by the voting ensemble.
voting.get_params()
# Keys are "<estimator name>__<parameter>", using the names given in voting_estimators.
params = {
    "logistic__C": [0.001, 0.1, 100.0],
    "tree__max_depth": list(range(1, 6)),
    "knn__n_neighbors": list(range(1, 6)),
}
# Exhaustive search over the grid, scored by 10-fold cross-validated ROC AUC.
grid = GridSearchCV(estimator=voting, param_grid=params, scoring="roc_auc", cv=10)
grid.fit(X_train, y_train)
# Report the cross-validated ROC AUC (mean and spread) of every parameter combination.
cv_results = grid.cv_results_
for mean_auc, std_auc, candidate in zip(
    cv_results["mean_test_score"],
    cv_results["std_test_score"],
    cv_results["params"],
):
    # Print the full standard deviation (it was previously halved) so the
    # "+/-" value means the same thing as in the per-model ROC AUC reports.
    print("%.3f +/- %.3f %r" % (mean_auc, std_auc, candidate))
# Best parameter combination and its mean cross-validated ROC AUC.
print("์ต์ ์ ํ๋ผ๋ฏธํฐ : %s" %grid.best_params_)
print("AUC : %.3f" % grid.best_score_)
9. 특성 중요도
- 랜덤 포레스트는 별도의 파라미터 튜닝 필요 없음
# Rank the features by the fitted forest's importance scores, highest first.
feat_labels = X.columns
importances = forest.feature_importances_
indices = np.argsort(importances)[::-1]
n_features = X_train.shape[1]

# Text table: rank, feature name left-aligned in 30 characters, importance.
for rank, feature_idx in enumerate(indices[:n_features], start=1):
    print("%2d) %-*s %f" % (rank, 30, feat_labels[feature_idx], importances[feature_idx]))

# Bar chart in the same descending order, with feature names on the x axis.
bar_positions = range(n_features)
plt.title("Feature Importances")
plt.bar(bar_positions, importances[indices], align="center")
plt.xticks(bar_positions, feat_labels[indices], rotation=90)
plt.xlim([-1, n_features])
plt.tight_layout()
plt.show()