# import data science basic libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
# Read the heart-disease CSV into a DataFrame and preview its first rows
df = pd.read_csv(filepath_or_buffer='D:/ML & AI EDU/heart.csv')
df.head(n=5)
Age | Sex | ChestPainType | RestingBP | Cholesterol | FastingBS | RestingECG | MaxHR | ExerciseAngina | Oldpeak | ST_Slope | HeartDisease | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 40 | M | ATA | 140 | 289 | 0 | Normal | 172 | N | 0.0 | Up | 0 |
1 | 49 | F | NAP | 160 | 180 | 0 | Normal | 156 | N | 1.0 | Flat | 1 |
2 | 37 | M | ATA | 130 | 283 | 0 | ST | 98 | N | 0.0 | Up | 0 |
3 | 48 | F | ASY | 138 | 214 | 0 | Normal | 108 | Y | 1.5 | Flat | 1 |
4 | 54 | M | NAP | 150 | 195 | 0 | Normal | 122 | N | 0.0 | Up | 0 |
# data shape as a (rows, columns) tuple
df.shape
(918, 12)
# per-column non-null counts and dtypes, plus memory usage
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 918 entries, 0 to 917 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Age 918 non-null int64 1 Sex 918 non-null object 2 ChestPainType 918 non-null object 3 RestingBP 918 non-null int64 4 Cholesterol 918 non-null int64 5 FastingBS 918 non-null int64 6 RestingECG 918 non-null object 7 MaxHR 918 non-null int64 8 ExerciseAngina 918 non-null object 9 Oldpeak 918 non-null float64 10 ST_Slope 918 non-null object 11 HeartDisease 918 non-null int64 dtypes: float64(1), int64(6), object(5) memory usage: 86.2+ KB
# number of distinct values in each column
df.nunique()
Age 50 Sex 2 ChestPainType 4 RestingBP 67 Cholesterol 222 FastingBS 2 RestingECG 3 MaxHR 119 ExerciseAngina 2 Oldpeak 53 ST_Slope 3 HeartDisease 2 dtype: int64
# basic statistics (count, mean, std, min, quartiles, max) of the numerical columns
df.describe()
Age | RestingBP | Cholesterol | FastingBS | MaxHR | Oldpeak | HeartDisease | |
---|---|---|---|---|---|---|---|
count | 918.000000 | 918.000000 | 918.000000 | 918.000000 | 918.000000 | 918.000000 | 918.000000 |
mean | 53.510893 | 132.396514 | 198.799564 | 0.233115 | 136.809368 | 0.887364 | 0.553377 |
std | 9.432617 | 18.514154 | 109.384145 | 0.423046 | 25.460334 | 1.066570 | 0.497414 |
min | 28.000000 | 0.000000 | 0.000000 | 0.000000 | 60.000000 | -2.600000 | 0.000000 |
25% | 47.000000 | 120.000000 | 173.250000 | 0.000000 | 120.000000 | 0.000000 | 0.000000 |
50% | 54.000000 | 130.000000 | 223.000000 | 0.000000 | 138.000000 | 0.600000 | 1.000000 |
75% | 60.000000 | 140.000000 | 267.000000 | 0.000000 | 156.000000 | 1.500000 | 1.000000 |
max | 77.000000 | 200.000000 | 603.000000 | 1.000000 | 202.000000 | 6.200000 | 1.000000 |
# missing values per column, in descending order
df.isnull().sum().sort_values(ascending=False)
Age 0 Sex 0 ChestPainType 0 RestingBP 0 Cholesterol 0 FastingBS 0 RestingECG 0 MaxHR 0 ExerciseAngina 0 Oldpeak 0 ST_Slope 0 HeartDisease 0 dtype: int64
# number of duplicated rows
df.duplicated().sum()
0
# Split the columns by dtype: object columns are treated as categorical,
# int64/float64 columns as numerical, then print both sub-frames.
Categorical = df.select_dtypes(include='object')
Numerical = df.select_dtypes(include=['int64', 'float64'])
for heading, frame in (
    ('Categorical features:\n', Categorical),
    ('Numerical features:\n', Numerical),
):
    print(heading, frame)
Categorical features: Sex ChestPainType RestingECG ExerciseAngina ST_Slope 0 M ATA Normal N Up 1 F NAP Normal N Flat 2 M ATA ST N Up 3 F ASY Normal Y Flat 4 M NAP Normal N Up .. .. ... ... ... ... 913 M TA Normal N Flat 914 M ASY Normal N Flat 915 M ASY Normal Y Flat 916 F ATA LVH N Flat 917 M NAP Normal N Up [918 rows x 5 columns] Numerical features: Age RestingBP Cholesterol FastingBS MaxHR Oldpeak HeartDisease 0 40 140 289 0 172 0.0 0 1 49 160 180 0 156 1.0 1 2 37 130 283 0 98 0.0 0 3 48 138 214 0 108 1.5 1 4 54 150 195 0 122 0.0 0 .. ... ... ... ... ... ... ... 913 45 110 264 0 132 1.2 1 914 68 144 193 1 141 3.4 1 915 57 130 131 0 115 1.2 1 916 57 130 236 0 174 0.0 1 917 38 138 175 0 173 0.0 0 [918 rows x 7 columns]
# number of rows for each value of the target column HeartDisease
df['HeartDisease'].value_counts()
1 508 0 410 Name: HeartDisease, dtype: int64
# Share of Normal vs. Heart Disease rows in the target column
target_counts = df['HeartDisease'].value_counts()
# value_counts() orders classes by frequency, not by class value, so the
# labels are looked up from the counted index instead of being hard-coded
# in an assumed order (which would mislabel the slices if 0 were the majority).
target_labels = {0: 'Normal[0]', 1: 'Heart Disease[1]'}
plt.figure(figsize=(10, 5))
plt.pie(
    target_counts,
    labels=[target_labels[cls] for cls in target_counts.index],
    autopct='%1.1f%%',
)
plt.show()
# plotting correlation matrix
# Restrict to numeric columns first: with pandas >= 2.0, DataFrame.corr()
# no longer drops object columns silently and raises on string values
# such as 'M' / 'ATA'. Selecting numeric dtypes works on every pandas version.
sns.heatmap(df.select_dtypes(include='number').corr(), annot=True)
plt.show()
# plotting numerical features against the target
for column in Numerical:
    # The target itself is among the numerical columns; plotting it with
    # itself as hue adds no information, so skip it.
    if column == 'HeartDisease':
        continue
    plt.figure(figsize=(10, 5))
    sns.countplot(x=column, data=df, hue='HeartDisease')
    # hue levels are 0 and 1, relabeled here as Normal / Heart Disease
    plt.legend(['Normal', 'Heart Disease'])
    plt.title(column)
    plt.show()
# plotting categorical features against the target
for column in Categorical:
    plt.figure(figsize=(10, 5))
    sns.countplot(x=column, data=df, hue='HeartDisease', edgecolor='black')
    # hue levels are 0 and 1, relabeled here as Normal / Heart Disease
    plt.legend(['Normal', 'Heart Disease'])
    plt.title(column)
    plt.show()
# pairplot of the data, colored by the target column HeartDisease
sns.pairplot(df, hue='HeartDisease')
plt.show()