Chapter 11. Working with Data
In [ ]:
# Install the pandas and seaborn libraries needed to run this notebook. (Skip this step if they are already installed.)
!pip install pandas
!pip install seaborn
In [1]:
# Import the required libraries.
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Load the Pima Indians diabetes dataset.
df = pd.read_csv('./data/pima-indians-diabetes3.csv')
In [2]:
# Look at the first five rows.
df.head(5)
Out[2]:
| | pregnant | plasma | pressure | thickness | insulin | bmi | pedigree | age | diabetes |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 6 | 148 | 72 | 35 | 0 | 33.6 | 0.627 | 50 | 1 |
| 1 | 1 | 85 | 66 | 29 | 0 | 26.6 | 0.351 | 31 | 0 |
| 2 | 8 | 183 | 64 | 0 | 0 | 23.3 | 0.672 | 32 | 1 |
| 3 | 1 | 89 | 66 | 23 | 94 | 28.1 | 0.167 | 21 | 0 |
| 4 | 0 | 137 | 40 | 35 | 168 | 43.1 | 2.288 | 33 | 1 |
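Before moving on, it can also help to confirm the column dtypes and that there are no explicitly missing values; the describe() output below reports a count of 768 for every column, so none are expected. A minimal sketch using pandas' standard inspection methods (not part of the original listing):

```python
# Column names, non-null counts, and dtypes in one view.
df.info()

# Explicit missing values (NaN) per column.
print(df.isnull().sum())
```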
In [3]:
# Check how many samples are normal and how many are diabetic.
df["diabetes"].value_counts()
Out[3]:
0    500
1    268
Name: diabetes, dtype: int64
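The two classes are imbalanced: 500 normal samples versus 268 diabetic ones, roughly 65% to 35%. If the proportions are wanted directly, value_counts() also accepts a normalize flag; a minimal sketch:

```python
# Show the class distribution as fractions instead of raw counts.
df["diabetes"].value_counts(normalize=True)
```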
In [4]:
# Print more detailed statistics for each feature.
df.describe()
Out[4]:
| | pregnant | plasma | pressure | thickness | insulin | bmi | pedigree | age | diabetes |
|---|---|---|---|---|---|---|---|---|---|
| count | 768.000000 | 768.000000 | 768.000000 | 768.000000 | 768.000000 | 768.000000 | 768.000000 | 768.000000 | 768.000000 |
| mean | 3.845052 | 120.894531 | 69.105469 | 20.536458 | 79.799479 | 31.992578 | 0.471876 | 33.240885 | 0.348958 |
| std | 3.369578 | 31.972618 | 19.355807 | 15.952218 | 115.244002 | 7.884160 | 0.331329 | 11.760232 | 0.476951 |
| min | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.078000 | 21.000000 | 0.000000 |
| 25% | 1.000000 | 99.000000 | 62.000000 | 0.000000 | 0.000000 | 27.300000 | 0.243750 | 24.000000 | 0.000000 |
| 50% | 3.000000 | 117.000000 | 72.000000 | 23.000000 | 30.500000 | 32.000000 | 0.372500 | 29.000000 | 0.000000 |
| 75% | 6.000000 | 140.250000 | 80.000000 | 32.000000 | 127.250000 | 36.600000 | 0.626250 | 41.000000 | 1.000000 |
| max | 17.000000 | 199.000000 | 122.000000 | 99.000000 | 846.000000 | 67.100000 | 2.420000 | 81.000000 | 1.000000 |
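Note that the minimum of plasma, pressure, thickness, insulin, and bmi is 0, which is not physically plausible for those measurements and most likely marks values that were never recorded. A small sketch for counting such zeros, with the column list chosen here based on the describe() output above:

```python
# Columns where a value of 0 most likely means "not measured".
zero_as_missing = ['plasma', 'pressure', 'thickness', 'insulin', 'bmi']

# Count how many rows contain 0 in each of these columns.
print((df[zero_as_missing] == 0).sum())
```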
In [5]:
# Examine how strongly the features are correlated with one another.
df.corr()
Out[5]:
| | pregnant | plasma | pressure | thickness | insulin | bmi | pedigree | age | diabetes |
|---|---|---|---|---|---|---|---|---|---|
| pregnant | 1.000000 | 0.129459 | 0.141282 | -0.081672 | -0.073535 | 0.017683 | -0.033523 | 0.544341 | 0.221898 |
| plasma | 0.129459 | 1.000000 | 0.152590 | 0.057328 | 0.331357 | 0.221071 | 0.137337 | 0.263514 | 0.466581 |
| pressure | 0.141282 | 0.152590 | 1.000000 | 0.207371 | 0.088933 | 0.281805 | 0.041265 | 0.239528 | 0.065068 |
| thickness | -0.081672 | 0.057328 | 0.207371 | 1.000000 | 0.436783 | 0.392573 | 0.183928 | -0.113970 | 0.074752 |
| insulin | -0.073535 | 0.331357 | 0.088933 | 0.436783 | 1.000000 | 0.197859 | 0.185071 | -0.042163 | 0.130548 |
| bmi | 0.017683 | 0.221071 | 0.281805 | 0.392573 | 0.197859 | 1.000000 | 0.140647 | 0.036242 | 0.292695 |
| pedigree | -0.033523 | 0.137337 | 0.041265 | 0.183928 | 0.185071 | 0.140647 | 1.000000 | 0.033561 | 0.173844 |
| age | 0.544341 | 0.263514 | 0.239528 | -0.113970 | -0.042163 | 0.036242 | 0.033561 | 1.000000 | 0.238356 |
| diabetes | 0.221898 | 0.466581 | 0.065068 | 0.074752 | 0.130548 | 0.292695 | 0.173844 | 0.238356 | 1.000000 |
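The column that matters most here is the correlation of each feature with diabetes: plasma (about 0.47) and bmi (about 0.29) stand out. To read that column at a glance instead of scanning the full matrix, one possible sketch:

```python
# Sort the features by their correlation with the diabetes label.
print(df.corr()['diabetes'].sort_values(ascending=False))
```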
In [6]:
# Plot the correlations between the features as a heatmap.
colormap = plt.cm.gist_heat   # choose the color scheme for the plot
plt.figure(figsize=(12,12))   # set the figure size
# Set the plot properties. vmax is set to 0.5 so that values close to 0.5 are shown in brighter colors.
sns.heatmap(df.corr(), linewidths=0.1, vmax=0.5, cmap=colormap, linecolor='white', annot=True)
plt.show()
4. Extracting the Important Data
In [7]:
# Look at how the normal and diabetic samples are distributed with respect to plasma.
plt.hist(x=[df.plasma[df.diabetes==0], df.plasma[df.diabetes==1]], bins=30, histtype='barstacked', label=['normal','diabetes'])
plt.legend()
Out[7]:
<matplotlib.legend.Legend at 0x213af7e80d0>
In [8]:
# Look at how the normal and diabetic samples are distributed with respect to BMI.
plt.hist(x=[df.bmi[df.diabetes==0], df.bmi[df.diabetes==1]], bins=30, histtype='barstacked', label=['normal','diabetes'])
plt.legend()
Out[8]:
<matplotlib.legend.Legend at 0x213afc00790>
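The same comparison can be drawn for any other column. As a sketch, a small helper avoids repeating the plotting code; the function name hist_by_diabetes is only an illustration and does not appear in the original listing:

```python
def hist_by_diabetes(column, bins=30):
    # Stacked histogram of one feature, split into normal (0) and diabetic (1) samples.
    plt.hist(x=[df[column][df.diabetes==0], df[column][df.diabetes==1]],
             bins=bins, histtype='barstacked', label=['normal', 'diabetes'])
    plt.xlabel(column)
    plt.legend()
    plt.show()

# For example, compare the age distributions of the two groups.
hist_by_diabetes('age')
```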
5. Running the Pima Indians Diabetes Prediction
In [9]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
# Import the pandas library.
import pandas as pd
# Load the Pima Indians diabetes dataset.
df = pd.read_csv('./data/pima-indians-diabetes3.csv')
In [10]:
# Assign the feature columns to X.
X = df.iloc[:,0:8]
# Assign the diabetes label to y.
y = df.iloc[:,8]
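Selecting by position with iloc works because diabetes is the last column; selecting by name is an equivalent and arguably more readable alternative. A minimal sketch using the column names from the table shown earlier:

```python
# Equivalent selection using column names instead of positions.
feature_cols = ['pregnant', 'plasma', 'pressure', 'thickness',
                'insulin', 'bmi', 'pedigree', 'age']
X = df[feature_cols]
y = df['diabetes']
```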
In [11]:
# Define the model.
model = Sequential()
model.add(Dense(12, input_dim=8, activation='relu', name='Dense_1'))
model.add(Dense(8, activation='relu', name='Dense_2'))
model.add(Dense(1, activation='sigmoid',name='Dense_3'))
model.summary()
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #
=================================================================
Dense_1 (Dense)              (None, 12)                108
_________________________________________________________________
Dense_2 (Dense)              (None, 8)                 104
_________________________________________________________________
Dense_3 (Dense)              (None, 1)                 9
=================================================================
Total params: 221
Trainable params: 221
Non-trainable params: 0
_________________________________________________________________
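The parameter counts can be checked by hand: each Dense layer holds inputs × units weights plus units biases, i.e. 8×12+12 = 108, 12×8+8 = 104, and 8×1+1 = 9, which matches the total of 221 shown above.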
In [12]:
# Compile the model.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# Train the model.
history=model.fit(X, y, epochs=100, batch_size=5)
Epoch 1/100 154/154 [==============================] - 2s 2ms/step - loss: 1.7955 - accuracy: 0.5430 Epoch 2/100 154/154 [==============================] - 0s 2ms/step - loss: 0.7971 - accuracy: 0.6432 Epoch 3/100 154/154 [==============================] - 0s 2ms/step - loss: 0.6672 - accuracy: 0.6419 Epoch 4/100 154/154 [==============================] - 0s 2ms/step - loss: 0.6360 - accuracy: 0.6406 Epoch 5/100 154/154 [==============================] - 0s 2ms/step - loss: 0.6389 - accuracy: 0.6523 Epoch 6/100 154/154 [==============================] - 0s 2ms/step - loss: 0.6424 - accuracy: 0.6549 Epoch 7/100 154/154 [==============================] - 0s 2ms/step - loss: 0.6341 - accuracy: 0.6536 Epoch 8/100 154/154 [==============================] - 0s 2ms/step - loss: 0.6282 - accuracy: 0.6589 Epoch 9/100 154/154 [==============================] - 0s 2ms/step - loss: 0.6304 - accuracy: 0.6589 Epoch 10/100 154/154 [==============================] - 0s 2ms/step - loss: 0.6226 - accuracy: 0.6549 Epoch 11/100 154/154 [==============================] - 0s 2ms/step - loss: 0.6226 - accuracy: 0.6523 Epoch 12/100 154/154 [==============================] - 0s 2ms/step - loss: 0.6312 - accuracy: 0.6576 Epoch 13/100 154/154 [==============================] - 0s 2ms/step - loss: 0.6157 - accuracy: 0.6615 Epoch 14/100 154/154 [==============================] - 0s 2ms/step - loss: 0.6070 - accuracy: 0.6589 Epoch 15/100 154/154 [==============================] - 0s 2ms/step - loss: 0.6040 - accuracy: 0.6549 Epoch 16/100 154/154 [==============================] - 0s 2ms/step - loss: 0.6009 - accuracy: 0.6589 Epoch 17/100 154/154 [==============================] - 0s 2ms/step - loss: 0.6093 - accuracy: 0.6536 Epoch 18/100 154/154 [==============================] - 0s 2ms/step - loss: 0.6081 - accuracy: 0.6602 Epoch 19/100 154/154 [==============================] - 0s 2ms/step - loss: 0.6077 - accuracy: 0.6562 Epoch 20/100 154/154 [==============================] - 0s 2ms/step - loss: 0.6107 - accuracy: 0.6562 Epoch 21/100 154/154 [==============================] - 0s 2ms/step - loss: 0.6200 - accuracy: 0.6536 Epoch 22/100 154/154 [==============================] - 0s 2ms/step - loss: 0.5986 - accuracy: 0.6615 Epoch 23/100 154/154 [==============================] - 0s 2ms/step - loss: 0.6011 - accuracy: 0.6615 Epoch 24/100 154/154 [==============================] - 0s 2ms/step - loss: 0.5976 - accuracy: 0.6589 Epoch 25/100 154/154 [==============================] - 0s 2ms/step - loss: 0.6000 - accuracy: 0.6602 Epoch 26/100 154/154 [==============================] - 0s 2ms/step - loss: 0.5955 - accuracy: 0.6628 Epoch 27/100 154/154 [==============================] - 0s 2ms/step - loss: 0.5923 - accuracy: 0.6602 Epoch 28/100 154/154 [==============================] - 0s 2ms/step - loss: 0.6019 - accuracy: 0.6602 Epoch 29/100 154/154 [==============================] - 0s 2ms/step - loss: 0.5965 - accuracy: 0.6615 Epoch 30/100 154/154 [==============================] - 0s 2ms/step - loss: 0.5958 - accuracy: 0.6615 Epoch 31/100 154/154 [==============================] - 0s 2ms/step - loss: 0.5957 - accuracy: 0.6602 Epoch 32/100 154/154 [==============================] - 0s 2ms/step - loss: 0.6010 - accuracy: 0.6602 Epoch 33/100 154/154 [==============================] - 0s 2ms/step - loss: 0.6026 - accuracy: 0.6615 Epoch 34/100 154/154 [==============================] - 0s 2ms/step - loss: 0.5958 - accuracy: 0.6615 Epoch 35/100 154/154 [==============================] - 0s 2ms/step - loss: 0.5942 - accuracy: 
0.6628 Epoch 36/100 154/154 [==============================] - 0s 2ms/step - loss: 0.6047 - accuracy: 0.6615 Epoch 37/100 154/154 [==============================] - 0s 2ms/step - loss: 0.5940 - accuracy: 0.6615 Epoch 38/100 154/154 [==============================] - 0s 2ms/step - loss: 0.5925 - accuracy: 0.6549 Epoch 39/100 154/154 [==============================] - 0s 2ms/step - loss: 0.5959 - accuracy: 0.6510 Epoch 40/100 154/154 [==============================] - 0s 2ms/step - loss: 0.5999 - accuracy: 0.6510 Epoch 41/100 154/154 [==============================] - 0s 2ms/step - loss: 0.5913 - accuracy: 0.6510 Epoch 42/100 154/154 [==============================] - 0s 2ms/step - loss: 0.6047 - accuracy: 0.6510 Epoch 43/100 154/154 [==============================] - 0s 2ms/step - loss: 0.5920 - accuracy: 0.6510 Epoch 44/100 154/154 [==============================] - 0s 2ms/step - loss: 0.6009 - accuracy: 0.6523 Epoch 45/100 154/154 [==============================] - 0s 2ms/step - loss: 0.5979 - accuracy: 0.6510 Epoch 46/100 154/154 [==============================] - 0s 2ms/step - loss: 0.5969 - accuracy: 0.6510 Epoch 47/100 154/154 [==============================] - 0s 2ms/step - loss: 0.5968 - accuracy: 0.6510 Epoch 48/100 154/154 [==============================] - 0s 2ms/step - loss: 0.6029 - accuracy: 0.6523 Epoch 49/100 154/154 [==============================] - 0s 2ms/step - loss: 0.6003 - accuracy: 0.6523 Epoch 50/100 154/154 [==============================] - 0s 2ms/step - loss: 0.5969 - accuracy: 0.6510 Epoch 51/100 154/154 [==============================] - 0s 2ms/step - loss: 0.5932 - accuracy: 0.6510 Epoch 52/100 154/154 [==============================] - 0s 2ms/step - loss: 0.5954 - accuracy: 0.6510 Epoch 53/100 154/154 [==============================] - 0s 2ms/step - loss: 0.6005 - accuracy: 0.6536 Epoch 54/100 154/154 [==============================] - 0s 2ms/step - loss: 0.5905 - accuracy: 0.6615 Epoch 55/100 154/154 [==============================] - 0s 2ms/step - loss: 0.5931 - accuracy: 0.6576 Epoch 56/100 154/154 [==============================] - 0s 2ms/step - loss: 0.5897 - accuracy: 0.6536 Epoch 57/100 154/154 [==============================] - 0s 2ms/step - loss: 0.5979 - accuracy: 0.6510 Epoch 58/100 154/154 [==============================] - 0s 2ms/step - loss: 0.5891 - accuracy: 0.6510 Epoch 59/100 154/154 [==============================] - 0s 2ms/step - loss: 0.5891 - accuracy: 0.6523 Epoch 60/100 154/154 [==============================] - 0s 2ms/step - loss: 0.5895 - accuracy: 0.6523 Epoch 61/100 154/154 [==============================] - 0s 2ms/step - loss: 0.5929 - accuracy: 0.6523 Epoch 62/100 154/154 [==============================] - 0s 2ms/step - loss: 0.5951 - accuracy: 0.6523 Epoch 63/100 154/154 [==============================] - 0s 2ms/step - loss: 0.5888 - accuracy: 0.6523 Epoch 64/100 154/154 [==============================] - 0s 2ms/step - loss: 0.5956 - accuracy: 0.6497 Epoch 65/100 154/154 [==============================] - 0s 2ms/step - loss: 0.5911 - accuracy: 0.6497 Epoch 66/100 154/154 [==============================] - 0s 2ms/step - loss: 0.5931 - accuracy: 0.6510 Epoch 67/100 154/154 [==============================] - 0s 2ms/step - loss: 0.5895 - accuracy: 0.6523 Epoch 68/100 154/154 [==============================] - 0s 2ms/step - loss: 0.5936 - accuracy: 0.6523 Epoch 69/100 154/154 [==============================] - 0s 2ms/step - loss: 0.5881 - accuracy: 0.6510 Epoch 70/100 154/154 [==============================] - 0s 2ms/step - loss: 
0.5838 - accuracy: 0.6510 Epoch 71/100 154/154 [==============================] - 0s 2ms/step - loss: 0.5877 - accuracy: 0.6510 Epoch 72/100 154/154 [==============================] - 0s 2ms/step - loss: 0.5874 - accuracy: 0.6523 Epoch 73/100 154/154 [==============================] - 0s 2ms/step - loss: 0.5862 - accuracy: 0.6523 Epoch 74/100 154/154 [==============================] - 0s 2ms/step - loss: 0.5819 - accuracy: 0.6523 Epoch 75/100 154/154 [==============================] - 0s 2ms/step - loss: 0.5819 - accuracy: 0.6523 Epoch 76/100 154/154 [==============================] - 0s 2ms/step - loss: 0.5882 - accuracy: 0.6523 Epoch 77/100 154/154 [==============================] - 1s 4ms/step - loss: 0.5853 - accuracy: 0.6510 Epoch 78/100 154/154 [==============================] - 0s 3ms/step - loss: 0.5833 - accuracy: 0.6510 Epoch 79/100 154/154 [==============================] - 0s 2ms/step - loss: 0.5760 - accuracy: 0.6589 Epoch 80/100 154/154 [==============================] - 0s 2ms/step - loss: 0.5810 - accuracy: 0.6602 Epoch 81/100 154/154 [==============================] - 0s 2ms/step - loss: 0.5754 - accuracy: 0.6523 Epoch 82/100 154/154 [==============================] - 0s 2ms/step - loss: 0.5825 - accuracy: 0.6510 Epoch 83/100 154/154 [==============================] - 0s 2ms/step - loss: 0.5790 - accuracy: 0.6510 Epoch 84/100 154/154 [==============================] - 0s 2ms/step - loss: 0.5797 - accuracy: 0.6523 Epoch 85/100 154/154 [==============================] - 0s 2ms/step - loss: 0.5827 - accuracy: 0.6523 Epoch 86/100 154/154 [==============================] - 0s 2ms/step - loss: 0.5780 - accuracy: 0.6523 Epoch 87/100 154/154 [==============================] - 0s 2ms/step - loss: 0.5774 - accuracy: 0.6419 Epoch 88/100 154/154 [==============================] - 0s 2ms/step - loss: 0.5784 - accuracy: 0.6888 Epoch 89/100 154/154 [==============================] - 0s 2ms/step - loss: 0.5737 - accuracy: 0.7057 Epoch 90/100 154/154 [==============================] - 0s 2ms/step - loss: 0.5721 - accuracy: 0.7018 Epoch 91/100 154/154 [==============================] - 0s 2ms/step - loss: 0.5750 - accuracy: 0.6979 Epoch 92/100 154/154 [==============================] - 0s 2ms/step - loss: 0.5679 - accuracy: 0.7031 Epoch 93/100 154/154 [==============================] - 0s 2ms/step - loss: 0.5727 - accuracy: 0.7031 Epoch 94/100 154/154 [==============================] - 0s 2ms/step - loss: 0.5718 - accuracy: 0.7135 Epoch 95/100 154/154 [==============================] - 0s 2ms/step - loss: 0.5782 - accuracy: 0.6953 Epoch 96/100 154/154 [==============================] - 0s 2ms/step - loss: 0.5720 - accuracy: 0.7096 Epoch 97/100 154/154 [==============================] - 0s 2ms/step - loss: 0.5683 - accuracy: 0.7188 Epoch 98/100 154/154 [==============================] - 0s 2ms/step - loss: 0.5784 - accuracy: 0.7057 Epoch 99/100 154/154 [==============================] - 0s 2ms/step - loss: 0.5741 - accuracy: 0.7096 Epoch 100/100 154/154 [==============================] - 0s 2ms/step - loss: 0.5705 - accuracy: 0.7161
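The fit() call stores the per-epoch loss and accuracy in the history object, so the training curve can be plotted afterwards. A minimal sketch, assuming matplotlib is still imported from the first cell of this chapter:

```python
# Plot how loss and accuracy changed over the 100 epochs.
plt.plot(history.history['loss'], label='loss')
plt.plot(history.history['accuracy'], label='accuracy')
plt.xlabel('epoch')
plt.legend()
plt.show()
```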