๐ ๊ณต๋ถํ๋ ์ง์ง์ํ์นด๋ ์ฒ์์ด์ง?
[Kaggle] HeartAttack ์์ธก ๋ณธ๋ฌธ
728x90
๋ฐ์ํ
220131 ์์ฑ
<๋ณธ ๋ธ๋ก๊ทธ๋ Kaggle ์ ์ฐธ๊ณ ํด์ ๊ณต๋ถํ๋ฉฐ ์์ฑํ์์ต๋๋ค>
https://www.kaggle.com/fahadmehfoooz/heartattack-prediction-with-91-8-accuracy/notebook
1. ๋ฐ์ดํฐ ๋ก๋
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn import svm
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import BernoulliNB, GaussianNB
from xgboost import XGBClassifier # model
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.metrics import roc_curve, auc
heart = pd.read_csv("heart.csv")
heart.head()
heart.info()
age : Age of the patient
sex : Sex of the patient
cp : Chest pain type
- 0 = Typical Angina
- 1 = Atypical Angina
- 2 = Non-anginal Pain (๋นํ์ฌ์ฆ)
- 3 = Asymptomatic (๋ฌด์ฆ์)
trtbps : Resting blood pressure (in mm Hg)
chol : Cholestoral in mg/dl fetched via BMI sensor
fbs : (fasting blood sugar > 120 mg/dl)
- 1 = True
- 0 = False
restecg : Resting electrocardiographic results
- 0 = Normal
- 1 = ST-T wave normality
- 2 = Left ventricular hypertrophy
thalachh : Maximum heart rate achieved
oldpeak : Previous peak
slp : Slope
caa : Number of major vessels
thall : Thalium Stress Test result ~ (0,3)
exng : Exercise induced angina ~
- 1 = Yes
- 0 = No
- output : Target variable
- NULL ์์
heart.isnull().sum()
- ์ค๋ณต๊ฐ ์๋?
heart[heart.duplicated()]
data:image/s3,"s3://crabby-images/b7692/b7692b914e926bc47a584732fdc2be64a34827e3" alt=""
- ํ๋ ์์ผ๋ DROP!
heart.drop_duplicates(keep = "first", inplace = True)
- ์์ฝ!
heart.describe()
- ์๊ด ๊ด๊ณ
heart.corr()
2. ๋ฐ์ดํฐ ์๊ฐํ
- ๋จ๋ ๋น์จ
sex = heart.sex.value_counts()
p = sns.countplot(data = heart, x = "sex")
plt.show()
- ์ง๋ณ ์ ํ
cp = heart.cp.value_counts()
p = sns.countplot(data = heart, x = "cp")
plt.show()
- ๊ณต๋ณต ํ๋น (๊ณต๋ณต ์ ํ์ก ๋ด ๋น ๋๋)
fbs = heart.fbs.value_counts()
p = sns.countplot(data = heart, x = "fbs")
plt.show()
- Resting electrocardiographic results
restecg = heart.restecg.value_counts()
p = sns.countplot(data = heart, x = "restecg")
plt.show()
- Exercise induced angina ~
exng = heart.exng.value_counts()
p = sns.countplot(data = heart, x = "exng")
plt.show()
- Thalium Stress Test result ~ (0,3)
thall = heart.thall.value_counts()
p = sns.countplot(data = heart, x = "thall")
plt.show()
- ๋์ด
plt.figure(figsize=(10,10))
sns.displot(heart.age, color="red", label="Age", kde= True)
plt.legend()
- Resting Blood Pressure
plt.figure(figsize=(20,20))
sns.displot(heart.trtbps , color="green", label="Resting Blood Pressure", kde= True)
plt.legend()
- Attack VS Age
plt.figure(figsize=(10,10))
sns.distplot(heart[heart['output'] == 0]["age"], color='green',kde=True,)
sns.distplot(heart[heart['output'] == 1]["age"], color='red',kde=True)
plt.title('Attack VS Age')
plt.show()
- Trtbs VS Age
plt.figure(figsize=(10,10))
sns.distplot(heart[heart['output'] == 0]["trtbps"], color='orange',kde=True,)
sns.distplot(heart[heart['output'] == 1]["trtbps"], color='red',kde=True)
plt.title('Trtbs versus Age')
plt.show()
- Thalachh VS Age
plt.figure(figsize=(10,10))
sns.distplot(heart[heart['output'] == 0]["thalachh"], color='pink',kde=True,)
sns.distplot(heart[heart['output'] == 1]["thalachh"], color='red',kde=True)
plt.title('Thalachh versus Age')
plt.show()
- Pair Plot
plt.figure(figsize=(20,20))
sns.pairplot(heart)
plt.show()
- Violin Plot
plt.figure(figsize=(13,13))
plt.subplot(2,3,1)
sns.violinplot(x = 'sex', y = 'output', data = heart)
plt.subplot(2,3,2)
sns.violinplot(x = 'thall', y = 'output', data = heart)
plt.subplot(2,3,3)
sns.violinplot(x = 'exng', y = 'output', data = heart)
plt.subplot(2,3,4)
sns.violinplot(x = 'restecg', y = 'output', data = heart)
plt.subplot(2,3,5)
sns.violinplot(x = 'cp', y = 'output', data = heart)
plt.xticks(fontsize=9, rotation=45)
plt.subplot(2,3,6)
sns.violinplot(x = 'fbs', y = 'output', data = heart)
plt.show()
3. ๋ฐ์ดํฐ ์ ์ฒ๋ฆฌ
heart.head()
- output ๋ฐ๋ก
X = heart.drop(['output'], axis = 1) # ์นผ๋ผ์ผ๋ก
y = heart["output"]
- train, test ๋ถ๋ฆฌ
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.2, random_state= 42)
print('Shape for training data', X_train.shape, y_train.shape)
print('Shape for testing data', X_test.shape, y_test.shape)
- scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
4. ๋ชจ๋ธ
- Logistic Regression
logistic = LogisticRegression()
logistic.fit(X_train, y_train)
predicted = logistic.predict(X_test)
conf = confusion_matrix(y_test, predicted)
print ("logistic Confusion Matrix : \n", conf)
print ("The accuracy of Logistic Regression is : ", accuracy_score(y_test, predicted)*100, "%")
-
Gaussian Naive Bayes
-
๊ฐ ํน์ฑ์ ๊ฐ๋ณ๋ก ์ทจ๊ธํด ํ๋ผ๋ฏธํฐ๋ฅผ ํ์ตํ๊ณ ๊ทธ ํน์ฑ์์ ํด๋์ค๋ณ ํต๊ณ๋ฅผ ๋จ์ํ๊ฒ ์ทจํฉ
gaussian = GaussianNB()
gaussian.fit(X_train, y_train)
predicted = gaussian.predict(X_test)
conf = confusion_matrix(y_test, predicted)
print ("GaussianNB Confusion Matrix : \n", conf)
print("The accuracy of Gaussian Naive Bayes model is : ", accuracy_score(y_test, predicted)*100, "%")
- Bernoulli Naive Bayes
bernoulli = BernoulliNB()
bernoulli.fit(X_train, y_train)
predicted = bernoulli.predict(X_test)
conf = confusion_matrix(y_test, predicted)
print ("Bernoulli Confusion Matrix : \n", conf)
print("The accuracy of Bernoulli Naive Bayes model is : ", accuracy_score(y_test, predicted)*100, "%")
- SVM
svm = SVC()
svm.fit(X_train, y_train)
predicted = svm.predict(X_test)
conf = confusion_matrix(y_test, predicted)
print ("SVM Confusion Matrix : \n", conf)
print("The accuracy of SVM is : ", accuracy_score(y_test, predicted)*100, "%")
- Random Forest
rf = RandomForestRegressor(n_estimators = 100, random_state = 42)
rf.fit(X_train, y_train)
predicted = rf.predict(X_test)
print("The accuracy of Random Forest is : ", accuracy_score(y_test, predicted.round())*100, "%")
- K Nearest Neighbours
KNN = KNeighborsClassifier(n_neighbors = 1)
KNN.fit(X_train, y_train)
predicted = KNN.predict(X_test)
conf = confusion_matrix(y_test, predicted)
print ("KNN Confusion Matrix : \n", conf)
print("The accuracy of KNN is : ", accuracy_score(y_test, predicted.round())*100, "%")
- +) KNN ์ต์ ํ
error_rate = []
for i in range(1, 40):
KNN = KNeighborsClassifier(n_neighbors = i)
KNN.fit(X_train, y_train)
pred_i = KNN.predict(X_test)
error_rate.append(np.mean(pred_i != y_test))
plt.figure(figsize =(10, 6))
plt.plot(range(1, 40), error_rate, color ='blue',
linestyle ='dashed', marker ='o',
markerfacecolor ='red', markersize = 10)
plt.title('Error Rate vs. K Value')
plt.xlabel('K')
plt.ylabel('Error Rate')
KNN_after = KNeighborsClassifier(n_neighbors = 7)
KNN_after.fit(X_train, y_train)
predicted = KNN_after.predict(X_test)
conf = confusion_matrix(y_test, predicted)
print ("KNN Confusion Matrix : \n", conf)
print("The accuracy of KNN is : ", accuracy_score(y_test, predicted.round())*100, "%")
- K Gradient Boosting
- ์ฌ๋ฌ ๊ฐ์ ๊ฒฐ์ ํธ๋ฆฌ๋ฅผ ์์์ ์ผ๋ก ํ์ตํ๋ ์์๋ธ์ ๋ถ์คํ ์ ํ
model = XGBClassifier(use_label_encoder=False)
model.fit(X_train, y_train)
predicted = model.predict(X_test)
conf = confusion_matrix(y_test, predicted)
print ("XGBClassifier Confusion Matrix : \n", conf)
print ("The accuracy of XGBClassifier is : ", accuracy_score(y_test, predicted)*100, "%")
5. ๊ฒฐ๋ก
knn ์ด ๋๋น
์ฐธ๊ณ ํ kaggle ์์ svm ์ด ๋์๋๋ ,,,
728x90
๋ฐ์ํ
'๐ฉโ๐ป ์ปดํจํฐ ๊ตฌ์กฐ > Kaggle' ์นดํ ๊ณ ๋ฆฌ์ ๋ค๋ฅธ ๊ธ
[Kaggle]Super Image Resolution_๊ณ ํ์ง ์ด๋ฏธ์ง ๋ง๋ค๊ธฐ (0) | 2022.02.07 |
---|---|
[Kaggle] CNN Architectures (0) | 2022.02.04 |
[Kaggle] Chest X-Ray ํ์ ์ด๋ฏธ์ง ๋ถ๋ฅํ๊ธฐ (0) | 2022.01.29 |
[Kaggle]Breast Cancer Wisconsin (Diagnostic) Data Set_์ ๋ฐฉ์ ๋ถ๋ฅ (0) | 2022.01.28 |
[Kaggle]MBTI_Myers-Briggs Personality Type Dataset(์ฑ๊ฒฉ์ฐ๊ตฌ) (0) | 2022.01.22 |
Comments