In [1]:
# import data science basic libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
In [2]:
# load data
df = pd.read_csv('D:/ML & AI EDU/heart.csv')
df.head()
Out[2]:
Age Sex ChestPainType RestingBP Cholesterol FastingBS RestingECG MaxHR ExerciseAngina Oldpeak ST_Slope HeartDisease
0 40 M ATA 140 289 0 Normal 172 N 0.0 Up 0
1 49 F NAP 160 180 0 Normal 156 N 1.0 Flat 1
2 37 M ATA 130 283 0 ST 98 N 0.0 Up 0
3 48 F ASY 138 214 0 Normal 108 Y 1.5 Flat 1
4 54 M NAP 150 195 0 Normal 122 N 0.0 Up 0
In [3]:
# data shape
df.shape
Out[3]:
(918, 12)
In [4]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    object 
 2   ChestPainType   918 non-null    object 
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    object 
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    object 
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    object 
 11  HeartDisease    918 non-null    int64  
dtypes: float64(1), int64(6), object(5)
memory usage: 86.2+ KB
In [5]:
# show unique values
df.nunique()
Out[5]:
Age                50
Sex                 2
ChestPainType       4
RestingBP          67
Cholesterol       222
FastingBS           2
RestingECG          3
MaxHR             119
ExerciseAngina      2
Oldpeak            53
ST_Slope            3
HeartDisease        2
dtype: int64
In [6]:
# data basic statistics
df.describe()
Out[6]:
Age RestingBP Cholesterol FastingBS MaxHR Oldpeak HeartDisease
count 918.000000 918.000000 918.000000 918.000000 918.000000 918.000000 918.000000
mean 53.510893 132.396514 198.799564 0.233115 136.809368 0.887364 0.553377
std 9.432617 18.514154 109.384145 0.423046 25.460334 1.066570 0.497414
min 28.000000 0.000000 0.000000 0.000000 60.000000 -2.600000 0.000000
25% 47.000000 120.000000 173.250000 0.000000 120.000000 0.000000 0.000000
50% 54.000000 130.000000 223.000000 0.000000 138.000000 0.600000 1.000000
75% 60.000000 140.000000 267.000000 0.000000 156.000000 1.500000 1.000000
max 77.000000 200.000000 603.000000 1.000000 202.000000 6.200000 1.000000
In [7]:
# missing values in decerding order
df.isnull().sum().sort_values(ascending=False)
Out[7]:
Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
HeartDisease      0
dtype: int64
In [8]:
# duplicated values
df.duplicated().sum()
Out[8]:
0
In [9]:
# numerical and categorical features 
Categorical = df.select_dtypes(include=['object'])
Numerical = df.select_dtypes(include=['int64', 'float64'])
print('Categorical features:\n', Categorical)
print('Numerical features:\n', Numerical)
Categorical features:
     Sex ChestPainType RestingECG ExerciseAngina ST_Slope
0     M           ATA     Normal              N       Up
1     F           NAP     Normal              N     Flat
2     M           ATA         ST              N       Up
3     F           ASY     Normal              Y     Flat
4     M           NAP     Normal              N       Up
..   ..           ...        ...            ...      ...
913   M            TA     Normal              N     Flat
914   M           ASY     Normal              N     Flat
915   M           ASY     Normal              Y     Flat
916   F           ATA        LVH              N     Flat
917   M           NAP     Normal              N       Up

[918 rows x 5 columns]
Numerical features:
      Age  RestingBP  Cholesterol  FastingBS  MaxHR  Oldpeak  HeartDisease
0     40        140          289          0    172      0.0             0
1     49        160          180          0    156      1.0             1
2     37        130          283          0     98      0.0             0
3     48        138          214          0    108      1.5             1
4     54        150          195          0    122      0.0             0
..   ...        ...          ...        ...    ...      ...           ...
913   45        110          264          0    132      1.2             1
914   68        144          193          1    141      3.4             1
915   57        130          131          0    115      1.2             1
916   57        130          236          0    174      0.0             1
917   38        138          175          0    173      0.0             0

[918 rows x 7 columns]
In [10]:
# count target variable
df['HeartDisease'].value_counts()
Out[10]:
1    508
0    410
Name: HeartDisease, dtype: int64
In [11]:
# Normal and Heart Disease with target column
plt.figure(figsize=(10,5))
plt.pie(df['HeartDisease'].value_counts(), labels=['Heart Disease[1]', 'Normal[0]'], autopct='%1.1f%%')
plt.show()
In [12]:
# ploting corelation matrix
sns.heatmap(df.corr(), annot=True)
plt.show()
In [13]:
# ploting numerical features with target
for i in Numerical:
    plt.figure(figsize=(10,5))
    sns.countplot(x=i, data=df, hue='HeartDisease')
    plt.legend(['Normal', 'Heart Disease'])
    plt.title(i)
    plt.show()
In [14]:
#ploting categorical features with target
for i in Categorical:
    plt.figure(figsize=(10,5))
    sns.countplot(x=i, data=df, hue='HeartDisease', edgecolor='black')
    plt.legend(['Normal', 'Heart Disease'])
    plt.title(i)
    plt.show()
In [15]:
#pairplot using target HeartDisease Column
sns.pairplot(df, hue='HeartDisease')
plt.show()