import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
import seaborn as sns
warnings.filterwarnings('ignore')
# Load the rent listings from a CSV expected in the current working directory,
# then preview the first rows to check that the columns were parsed.
df = pd.read_csv('House_Rent_Dataset.csv')
df.head()
Posted On | BHK | Rent | Size | Floor | Area Type | Area Locality | City | Furnishing Status | Tenant Preferred | Bathroom | Point of Contact | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2022-05-18 | 2 | 10000 | 1100 | Ground out of 2 | Super Area | Bandel | Kolkata | Unfurnished | Bachelors/Family | 2 | Contact Owner |
1 | 2022-05-13 | 2 | 20000 | 800 | 1 out of 3 | Super Area | Phool Bagan, Kankurgachi | Kolkata | Semi-Furnished | Bachelors/Family | 1 | Contact Owner |
2 | 2022-05-16 | 2 | 17000 | 1000 | 1 out of 3 | Super Area | Salt Lake City Sector 2 | Kolkata | Semi-Furnished | Bachelors/Family | 1 | Contact Owner |
3 | 2022-07-04 | 2 | 10000 | 800 | 1 out of 2 | Super Area | Dumdum Park | Kolkata | Unfurnished | Bachelors/Family | 1 | Contact Owner |
4 | 2022-05-09 | 2 | 7500 | 850 | 1 out of 2 | Carpet Area | South Dum Dum | Kolkata | Unfurnished | Bachelors | 1 | Contact Owner |
df.sample()
Posted On | BHK | Rent | Size | Floor | Area Type | Area Locality | City | Furnishing Status | Tenant Preferred | Bathroom | Point of Contact | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
1162 | 2022-06-04 | 2 | 40000 | 625 | 6 out of 14 | Carpet Area | Kailash Tower, Powai | Mumbai | Semi-Furnished | Bachelors | 2 | Contact Agent |
df.shape
(4746, 12)
df.columns
Index(['Posted On', 'BHK', 'Rent', 'Size', 'Floor', 'Area Type', 'Area Locality', 'City', 'Furnishing Status', 'Tenant Preferred', 'Bathroom', 'Point of Contact'], dtype='object')
Plot a histogram for each column that contains numeric values.
# One 30-bin histogram per numeric column, drawn on a single large canvas.
df.hist(figsize=(20, 15), bins=30)
plt.show()
# Correlate the numeric columns only. The frame still holds text columns at
# this point (City, Floor, Area Type, ...), and DataFrame.corr() raises on
# those with pandas >= 2.0 instead of silently skipping them.
plt.figure()
numeric_corr = df.select_dtypes(include='number').corr()
sns.heatmap(numeric_corr, annot=True, cmap="PuBu")
<AxesSubplot:>
# Name the column via keywords: a positional Series is deprecated in seaborn
# 0.11 and is interpreted as `data` (not `x`) from seaborn 0.12 onward.
sns.countplot(x='Area Type', data=df)
<AxesSubplot:xlabel='Area Type', ylabel='count'>
# Name the column via keywords: a positional Series is deprecated in seaborn
# 0.11 and is interpreted as `data` (not `x`) from seaborn 0.12 onward.
sns.countplot(x='Furnishing Status', data=df)
<AxesSubplot:xlabel='Furnishing Status', ylabel='count'>
# Name the column via keywords: a positional Series is deprecated in seaborn
# 0.11 and is interpreted as `data` (not `x`) from seaborn 0.12 onward.
sns.countplot(x='Tenant Preferred', data=df)
<AxesSubplot:xlabel='Tenant Preferred', ylabel='count'>
# Name the column via keywords: a positional Series is deprecated in seaborn
# 0.11 and is interpreted as `data` (not `x`) from seaborn 0.12 onward.
sns.countplot(x='City', data=df)
<AxesSubplot:xlabel='City', ylabel='count'>
sns.set_context('notebook', font_scale=1.3)
plt.figure(figsize=(22, 7))
# Rent per city, split by furnishing status, aggregated with barplot's default
# estimator; ci=None switches the error bars off.
ax = sns.barplot(
    x=df['City'],
    y=df['Rent'],
    hue=df['Furnishing Status'],
    palette='viridis',
    ci=None,
)
plt.ylabel('Rent')
# Label every bar with its (truncated) height.
for p in ax.patches:
    height = p.get_height()
    # A city/furnishing combination without rows has no height; int(nan) would raise.
    if np.isnan(height):
        continue
    # Centre on the bar's real width rather than a fixed 0.14 offset, which
    # is only roughly right when there are exactly three hue levels.
    ax.annotate(
        str(int(height)),
        (p.get_x() + p.get_width() / 2, height + 1),
        ha='center',
        va='bottom',
        color='Black',
    )
Convert the `Posted On` column from object to datetime and extract the posting month.
# Parse the posting date once, store it back, and keep its month number as a
# separate numeric column.
posted_on = pd.to_datetime(df['Posted On'])
df['Posted On'] = posted_on
df['month posted'] = posted_on.dt.month
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 4746 entries, 0 to 4745 Data columns (total 13 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Posted On 4746 non-null datetime64[ns] 1 BHK 4746 non-null int64 2 Rent 4746 non-null int64 3 Size 4746 non-null int64 4 Floor 4746 non-null object 5 Area Type 4746 non-null object 6 Area Locality 4746 non-null object 7 City 4746 non-null object 8 Furnishing Status 4746 non-null object 9 Tenant Preferred 4746 non-null object 10 Bathroom 4746 non-null int64 11 Point of Contact 4746 non-null object 12 month posted 4746 non-null int64 dtypes: datetime64[ns](1), int64(5), object(7) memory usage: 482.1+ KB
# The raw date is no longer needed once 'month posted' has been derived from it.
df.drop(columns='Posted On', inplace=True)
df
BHK | Rent | Size | Floor | Area Type | Area Locality | City | Furnishing Status | Tenant Preferred | Bathroom | Point of Contact | month posted | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2 | 10000 | 1100 | Ground out of 2 | Super Area | Bandel | Kolkata | Unfurnished | Bachelors/Family | 2 | Contact Owner | 5 |
1 | 2 | 20000 | 800 | 1 out of 3 | Super Area | Phool Bagan, Kankurgachi | Kolkata | Semi-Furnished | Bachelors/Family | 1 | Contact Owner | 5 |
2 | 2 | 17000 | 1000 | 1 out of 3 | Super Area | Salt Lake City Sector 2 | Kolkata | Semi-Furnished | Bachelors/Family | 1 | Contact Owner | 5 |
3 | 2 | 10000 | 800 | 1 out of 2 | Super Area | Dumdum Park | Kolkata | Unfurnished | Bachelors/Family | 1 | Contact Owner | 7 |
4 | 2 | 7500 | 850 | 1 out of 2 | Carpet Area | South Dum Dum | Kolkata | Unfurnished | Bachelors | 1 | Contact Owner | 5 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
4741 | 2 | 15000 | 1000 | 3 out of 5 | Carpet Area | Bandam Kommu | Hyderabad | Semi-Furnished | Bachelors/Family | 2 | Contact Owner | 5 |
4742 | 3 | 29000 | 2000 | 1 out of 4 | Super Area | Manikonda, Hyderabad | Hyderabad | Semi-Furnished | Bachelors/Family | 3 | Contact Owner | 5 |
4743 | 3 | 35000 | 1750 | 3 out of 5 | Carpet Area | Himayath Nagar, NH 7 | Hyderabad | Semi-Furnished | Bachelors/Family | 3 | Contact Agent | 7 |
4744 | 3 | 45000 | 1500 | 23 out of 34 | Carpet Area | Gachibowli | Hyderabad | Semi-Furnished | Family | 2 | Contact Agent | 7 |
4745 | 2 | 15000 | 1000 | 4 out of 5 | Carpet Area | Suchitra Circle | Hyderabad | Unfurnished | Bachelors | 2 | Contact Owner | 5 |
4746 rows × 12 columns
Encode the `Floor` column as a numeric floor level using the map function.
# Parse the complete leading number of each entry. Taking only the first
# character (.str[0]) collapsed multi-digit floors: "23 out of 34" became 2.
leading_number = pd.to_numeric(
    df['Floor'].str.extract(r'^(\d+)', expand=False), errors='coerce'
)
# Entries that do not start with a digit keep the original first-letter codes
# (presumably Lower basement / Ground / Upper basement -- confirm against the data).
named_level = df['Floor'].str[0].map({'L': -1, 'G': 0, 'U': 0})
floor_level = leading_number.fillna(named_level)
# Cast to int only when every entry was recognised; otherwise keep the NaNs
# visible, as the plain map() did for unmapped values.
df['Floor'] = floor_level.astype(int) if floor_level.notna().all() else floor_level
df["Floor"].unique()
array([ 0, 1, 2, 4, 3, 5, 7, 8, -1, 6, 9], dtype=int64)
df['Area Type'].unique()
array(['Super Area', 'Carpet Area', 'Built Area'], dtype=object)
def one_hot_encode(data, column):
    """Return a copy of ``data`` with ``column`` replaced by 0/1 indicators.

    Each category level of ``column`` except the first becomes a new column
    named after the level (``drop_first=True`` avoids a redundant indicator).
    The input frame itself is not modified.

    Args:
        data: DataFrame that contains ``column``.
        column: Name of the categorical column to expand.

    Returns:
        A new DataFrame without ``column`` and with the indicator columns
        appended on the right.
    """
    # dtype=int keeps the indicators 0/1 on every pandas version; pandas >= 2.0
    # would otherwise produce True/False columns.
    encoded = pd.get_dummies(data[column], drop_first=True, dtype=int)
    return data.drop(columns=column).join(encoded)
# Categorical columns that get expanded into indicator columns, one at a time.
features = [
    'Area Type',
    'City',
    'Furnishing Status',
    'Tenant Preferred',
    'Point of Contact',
]
for feature in features:
    df = one_hot_encode(df, feature)
df.sample(5)
BHK | Rent | Size | Floor | Area Locality | Bathroom | month posted | Carpet Area | Super Area | Chennai | Delhi | Hyderabad | Kolkata | Mumbai | Semi-Furnished | Unfurnished | Bachelors/Family | Family | Contact Builder | Contact Owner | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1643 | 2 | 8000 | 600 | 1 | Electronic City | 1 | 6 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 1 |
2931 | 3 | 90000 | 1550 | 2 | Safdarjung Enclave | 3 | 7 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
3661 | 1 | 7000 | 400 | 0 | Nanmangalam | 1 | 6 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 |
3440 | 2 | 18000 | 1100 | 0 | Purasawalkam, PH Road | 2 | 6 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 |
3804 | 3 | 37000 | 1650 | 2 | Vadapalani | 3 | 5 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 |
# 'Area Locality' is text and was not one-hot encoded above, so it is removed
# before the remaining columns are used as features.
df.drop(columns='Area Locality', inplace=True)
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from scipy.stats import probplot, boxcox
from scipy.special import inv_boxcox
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
# Features: every column except the target.
x = df.drop("Rent", axis=1)
# The models are trained on log-rent.
# NOTE: this overwrites df['Rent'] in place, so re-running this cell applies
# the log a second time.
df['Rent'] = np.log(df['Rent'])
# Select the target by name. The positional iloc[:, 1] only pointed at Rent
# as long as earlier cells had dropped/added columns in exactly this order.
y = df['Rent']
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.20, random_state=40
)
# Fit the scaler on the training split only and reuse its statistics for the
# test split.
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)
# Baseline: 5-fold cross-validated R^2 of a plain linear regression.
# The folds are drawn from the training split (as for the later models) so
# the test split stays held out; scoring on x_test/y_test used the hold-out
# data for model selection. Scores recorded from the earlier test-split run
# will differ from what this prints.
lr = LinearRegression()
scores = cross_val_score(lr, x_train, y_train, scoring='r2', cv=5)
print(scores)
print(np.mean(scores))
[0.81063636 0.82698973 0.78587266 0.78989185 0.75429624] 0.7935373679222344
from sklearn.preprocessing import PolynomialFeatures

# Degree-2 polynomial expansion of the scaled training features.
poly = PolynomialFeatures(2)
data_poly = poly.fit_transform(x_train)

# Heavily regularised ridge regression, scored with 5-fold R^2. Column 0 of
# the expansion is skipped via [:, 1:].
lr = Ridge(alpha=5000)
ridge_scores = cross_val_score(lr, data_poly[:, 1:], y_train, scoring='r2', cv=5)
np.mean(ridge_scores).round(2)
0.74
# 5-fold R^2 of a depth-limited random forest on the polynomial features.
# The forest is seeded (same seed as the train/test split) so the reported
# score can be reproduced; unseeded, every run gives a slightly different value.
rf = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=40)
np.mean(cross_val_score(rf, data_poly[:, 1:], y_train, scoring='r2', cv=5)).round(2)
0.82
from sklearn.metrics import r2_score

# Final model: fit on the full training split. Seeded (same seed as the
# train/test split) so the hold-out score below is reproducible.
rf = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=40)
rf.fit(data_poly[:, 1:], y_train)
# Reuse the already-fitted polynomial expansion on the test split and skip
# column 0, exactly as was done for the training matrix.
y_pred = rf.predict(poly.transform(x_test)[:, 1:])
score = r2_score(y_test, y_pred)
score
0.8067773953617802