import seaborn as sns
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
# Show every column when a frame is displayed instead of eliding the middle ones.
pd.set_option('display.max_columns', None)
import warnings
# Silence all warnings for the rest of the session.
# NOTE(review): this also hides library deprecation warnings — consider narrowing the filter.
warnings.filterwarnings("ignore")
# Load the price/volume table from an absolute path.
# NOTE(review): the path looks machine-specific — confirm where the CSV should live.
df = pd.read_csv('C:/Users/Sanchit/Downloads/crypto prices.csv')
# Display the loaded frame.
df
Unnamed: 0 | Date | Adj Close (BNB) | Volume (BNB) | Adj Close (BTC) | Volume (BTC) | Adj Close (USDT) | Volume (USDT) | Adj Close (ETH) | Volume (ETH) | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 11/9/2017 | 1.990770 | 19192200 | 7143.580078 | 3226249984 | 1.008180 | 358188000 | 320.884003 | 893249984 |
1 | 1 | 11/10/2017 | 1.796840 | 11155000 | 6618.140137 | 5208249856 | 1.006010 | 756446016 | 299.252991 | 885985984 |
2 | 2 | 11/11/2017 | 1.670470 | 8178150 | 6357.600098 | 4908680192 | 1.008990 | 746227968 | 314.681000 | 842300992 |
3 | 3 | 11/12/2017 | 1.519690 | 15298700 | 5950.069824 | 8957349888 | 1.012470 | 1466060032 | 307.907990 | 1613479936 |
4 | 4 | 11/13/2017 | 1.686620 | 12238800 | 6559.490234 | 6263249920 | 1.009350 | 767884032 | 316.716003 | 1041889984 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1725 | 1725 | 7/31/2022 | 283.579468 | 1313531523 | 23336.896484 | 23553591896 | 1.000328 | 52267348020 | 1681.517334 | 14200735370 |
1726 | 1726 | 8/1/2022 | 283.539490 | 1314157614 | 23314.199219 | 25849159141 | 1.000204 | 50882756969 | 1635.195801 | 16191371176 |
1727 | 1727 | 8/2/2022 | 283.820984 | 1768344106 | 22978.117188 | 28389250717 | 1.000159 | 54793315279 | 1632.945435 | 20426082309 |
1728 | 1728 | 8/3/2022 | 298.356781 | 2133584480 | 22846.507813 | 26288169966 | 1.000204 | 47717439471 | 1618.874512 | 16786218830 |
1729 | 1729 | 8/4/2022 | 310.706055 | 1926587001 | 22858.423828 | 24817580032 | 1.000133 | 44526180493 | 1608.205811 | 14467440626 |
1730 rows × 10 columns
# Preview the first five rows of the loaded frame.
df.head()
Unnamed: 0 | Date | Adj Close (BNB) | Volume (BNB) | Adj Close (BTC) | Volume (BTC) | Adj Close (USDT) | Volume (USDT) | Adj Close (ETH) | Volume (ETH) | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 11/9/2017 | 1.99077 | 19192200 | 7143.580078 | 3226249984 | 1.00818 | 358188000 | 320.884003 | 893249984 |
1 | 1 | 11/10/2017 | 1.79684 | 11155000 | 6618.140137 | 5208249856 | 1.00601 | 756446016 | 299.252991 | 885985984 |
2 | 2 | 11/11/2017 | 1.67047 | 8178150 | 6357.600098 | 4908680192 | 1.00899 | 746227968 | 314.681000 | 842300992 |
3 | 3 | 11/12/2017 | 1.51969 | 15298700 | 5950.069824 | 8957349888 | 1.01247 | 1466060032 | 307.907990 | 1613479936 |
4 | 4 | 11/13/2017 | 1.68662 | 12238800 | 6559.490234 | 6263249920 | 1.00935 | 767884032 | 316.716003 | 1041889984 |
# Check how many rows and columns the frame has; `shape` is (rows, columns).
df.shape
(1730, 10)
As we can see, there are 1730 rows and 10 columns.
# Print a concise summary of the frame (index range, column names,
# non-null counts and dtypes).
# `info` is a method: without the call parentheses the cell only echoes the
# bound-method repr (which embeds a dump of the whole frame) and no summary
# is produced.
df.info()
<bound method DataFrame.info of Unnamed: 0 Date Adj Close (BNB) Volume (BNB) Adj Close (BTC) \ 0 0 11/9/2017 1.990770 19192200 7143.580078 1 1 11/10/2017 1.796840 11155000 6618.140137 2 2 11/11/2017 1.670470 8178150 6357.600098 3 3 11/12/2017 1.519690 15298700 5950.069824 4 4 11/13/2017 1.686620 12238800 6559.490234 ... ... ... ... ... ... 1725 1725 7/31/2022 283.579468 1313531523 23336.896484 1726 1726 8/1/2022 283.539490 1314157614 23314.199219 1727 1727 8/2/2022 283.820984 1768344106 22978.117188 1728 1728 8/3/2022 298.356781 2133584480 22846.507813 1729 1729 8/4/2022 310.706055 1926587001 22858.423828 Volume (BTC) Adj Close (USDT) Volume (USDT) Adj Close (ETH) \ 0 3226249984 1.008180 358188000 320.884003 1 5208249856 1.006010 756446016 299.252991 2 4908680192 1.008990 746227968 314.681000 3 8957349888 1.012470 1466060032 307.907990 4 6263249920 1.009350 767884032 316.716003 ... ... ... ... ... 1725 23553591896 1.000328 52267348020 1681.517334 1726 25849159141 1.000204 50882756969 1635.195801 1727 28389250717 1.000159 54793315279 1632.945435 1728 26288169966 1.000204 47717439471 1618.874512 1729 24817580032 1.000133 44526180493 1608.205811 Volume (ETH) 0 893249984 1 885985984 2 842300992 3 1613479936 4 1041889984 ... ... 1725 14200735370 1726 16191371176 1727 20426082309 1728 16786218830 1729 14467440626 [1730 rows x 10 columns]>
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()
Unnamed: 0 | Adj Close (BNB) | Volume (BNB) | Adj Close (BTC) | Volume (BTC) | Adj Close (USDT) | Volume (USDT) | Adj Close (ETH) | Volume (ETH) | |
---|---|---|---|---|---|---|---|---|---|
count | 1730.0000 | 1730.000000 | 1.730000e+03 | 1730.000000 | 1.730000e+03 | 1730.000000 | 1.730000e+03 | 1730.000000 | 1.730000e+03 |
mean | 864.5000 | 134.960777 | 9.647885e+08 | 20191.519348 | 2.570080e+10 | 1.001717 | 4.085393e+10 | 1105.603148 | 1.288480e+10 |
std | 499.5523 | 185.725539 | 1.486405e+09 | 17507.045641 | 2.003526e+10 | 0.005928 | 3.912941e+10 | 1233.514214 | 1.104728e+10 |
min | 0.0000 | 1.510360 | 9.284000e+03 | 3236.761719 | 2.923670e+09 | 0.966644 | 3.581880e+08 | 84.308296 | 6.217330e+08 |
25% | 432.2500 | 13.670646 | 1.177502e+08 | 7457.858887 | 9.718123e+09 | 0.999968 | 4.705288e+09 | 203.758255 | 3.844413e+09 |
50% | 864.5000 | 22.234484 | 3.079566e+08 | 10330.514649 | 2.313310e+10 | 1.000601 | 3.348729e+10 | 436.047501 | 1.047600e+10 |
75% | 1296.7500 | 288.047844 | 1.467549e+09 | 35538.384766 | 3.518178e+10 | 1.002838 | 6.087353e+10 | 1828.478180 | 1.823330e+10 |
max | 1729.0000 | 675.684082 | 1.798295e+10 | 67566.828125 | 3.509679e+11 | 1.077880 | 2.790675e+11 | 4812.087402 | 8.448291e+10 |
# Count missing values per column.
df.isna().sum()
Unnamed: 0 0 Date 0 Adj Close (BNB) 0 Volume (BNB) 0 Adj Close (BTC) 0 Volume (BTC) 0 Adj Close (USDT) 0 Volume (USDT) 0 Adj Close (ETH) 0 Volume (ETH) 0 dtype: int64
As we can see, there are no null values in our dataset, so we move on to visual analysis.
# Overview: draw the frame in wide form, one line per numeric column,
# against the row index.
plt.figure(figsize=(25, 5))
sns.set_style('dark')
# The original referenced an undefined name `data`; the loaded frame is `df`.
sns.lineplot(data=df)
<AxesSubplot:>
# Compare the adjusted closing price of the four coins on one set of axes.
plt.figure(figsize=(20, 10))
sns.set_style('dark')
# The original referenced an undefined name `data`; the loaded frame is `df`.
# Coins are drawn in the same order as before so legend order is unchanged.
for coin in ('BNB', 'ETH', 'BTC', 'USDT'):
    sns.lineplot(data=df['Adj Close ({})'.format(coin)], label=coin)
# "Adj Close" abbreviates "adjusted close", not "adjacent close".
plt.title('Adjusted Close Price')
Text(0.5, 1.0, 'Adjacent Close Price')
# Compare the traded volume of the four coins on one set of axes.
plt.figure(figsize=(20, 10))
sns.set_style('dark')
# The original referenced an undefined name `data`; the loaded frame is `df`.
# Coins are drawn in the same order as before so legend order is unchanged.
for coin in ('BNB', 'ETH', 'BTC', 'USDT'):
    sns.lineplot(data=df['Volume ({})'.format(coin)], label=coin)
plt.title('Volume')
Text(0.5, 1.0, 'Volume')
# Histogram of every numeric column, laid out on a 4 x 8 grid of axes.
# NOTE(review): the grid has 32 slots, far more than the frame has numeric
# columns — consider a tighter layout.
df.hist(figsize=(20, 8), layout=(4,8))
array([[<AxesSubplot:title={'center':'Unnamed: 0'}>, <AxesSubplot:title={'center':'Adj Close (BNB)'}>, <AxesSubplot:title={'center':'Volume (BNB)'}>, <AxesSubplot:title={'center':'Adj Close (BTC)'}>, <AxesSubplot:title={'center':'Volume (BTC)'}>, <AxesSubplot:title={'center':'Adj Close (USDT)'}>, <AxesSubplot:title={'center':'Volume (USDT)'}>, <AxesSubplot:title={'center':'Adj Close (ETH)'}>], [<AxesSubplot:title={'center':'Volume (ETH)'}>, <AxesSubplot:>, <AxesSubplot:>, <AxesSubplot:>, <AxesSubplot:>, <AxesSubplot:>, <AxesSubplot:>, <AxesSubplot:>], [<AxesSubplot:>, <AxesSubplot:>, <AxesSubplot:>, <AxesSubplot:>, <AxesSubplot:>, <AxesSubplot:>, <AxesSubplot:>, <AxesSubplot:>], [<AxesSubplot:>, <AxesSubplot:>, <AxesSubplot:>, <AxesSubplot:>, <AxesSubplot:>, <AxesSubplot:>, <AxesSubplot:>, <AxesSubplot:>]], dtype=object)
# Pairwise Pearson correlation between the numeric columns.
# The string `Date` column is dropped explicitly: pandas >= 2.0 no longer
# skips non-numeric columns silently in `corr()` and raises instead, while
# older versions give the same result either way.
df.select_dtypes(include='number').corr()
Unnamed: 0 | Adj Close (BNB) | Volume (BNB) | Adj Close (BTC) | Volume (BTC) | Adj Close (USDT) | Volume (USDT) | Adj Close (ETH) | Volume (ETH) | |
---|---|---|---|---|---|---|---|---|---|
Unnamed: 0 | 1.000000 | 0.761237 | 0.571730 | 0.743903 | 0.586429 | -0.181445 | 0.722177 | 0.697317 | 0.674547 |
Adj Close (BNB) | 0.761237 | 1.000000 | 0.732715 | 0.918562 | 0.407226 | -0.154652 | 0.659554 | 0.962406 | 0.569978 |
Volume (BNB) | 0.571730 | 0.732715 | 1.000000 | 0.768149 | 0.600409 | -0.117310 | 0.773426 | 0.654506 | 0.699198 |
Adj Close (BTC) | 0.743903 | 0.918562 | 0.768149 | 1.000000 | 0.573059 | -0.151540 | 0.773313 | 0.926972 | 0.672594 |
Volume (BTC) | 0.586429 | 0.407226 | 0.600409 | 0.573059 | 1.000000 | -0.095888 | 0.867511 | 0.383787 | 0.859965 |
Adj Close (USDT) | -0.181445 | -0.154652 | -0.117310 | -0.151540 | -0.095888 | 1.000000 | -0.139902 | -0.149685 | -0.119487 |
Volume (USDT) | 0.722177 | 0.659554 | 0.773426 | 0.773313 | 0.867511 | -0.139902 | 1.000000 | 0.629372 | 0.948139 |
Adj Close (ETH) | 0.697317 | 0.962406 | 0.654506 | 0.926972 | 0.383787 | -0.149685 | 0.629372 | 1.000000 | 0.545473 |
Volume (ETH) | 0.674547 | 0.569978 | 0.699198 | 0.672594 | 0.859965 | -0.119487 | 0.948139 | 0.545473 | 1.000000 |
# Annotated heatmap of the correlation matrix, followed by a pairwise scatter
# matrix on a random sample of 100 rows.
plt.figure(figsize=(18,16))
# The original referenced an undefined name `data`; the loaded frame is `df`.
# Only numeric columns are correlated: pandas >= 2.0 raises on the string
# `Date` column otherwise.
sns.heatmap(df.select_dtypes(include='number').corr(), annot=True, cmap=plt.cm.CMRmap_r);
# NOTE(review): the sample is unseeded, so the plotted rows change on every run.
sns.pairplot(df.sample(n=100));
# Predictors are the adjusted closes of BNB, USDT and ETH; the value to be
# predicted is the adjusted close of BTC on the same row.
feature_columns = ['Adj Close (BNB)', 'Adj Close (USDT)', 'Adj Close (ETH)']
target_column = 'Adj Close (BTC)'
X = df[feature_columns]
Y = df[target_column]
# Preview the feature matrix.
X.head()
Adj Close (BNB) | Adj Close (USDT) | Adj Close (ETH) | |
---|---|---|---|
0 | 1.99077 | 1.00818 | 320.884003 |
1 | 1.79684 | 1.00601 | 299.252991 |
2 | 1.67047 | 1.00899 | 314.681000 |
3 | 1.51969 | 1.01247 | 307.907990 |
4 | 1.68662 | 1.00935 | 316.716003 |
# Preview the target series.
Y.head()
0 7143.580078 1 6618.140137 2 6357.600098 3 5950.069824 4 6559.490234 Name: Adj Close (BTC), dtype: float64
# Hold out 30% of the rows as a test set; random_state pins the split.
# NOTE(review): the rows are dated daily observations and train_test_split
# shuffles by default, so test days end up interleaved with training days —
# confirm that a chronological split is not wanted.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.3, random_state=2)
from sklearn.preprocessing import MinMaxScaler
# Fit the scaler on the training features only, then reuse it for the test
# features so no test statistics leak into the scaling.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
from sklearn.neighbors import KNeighborsRegressor
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingRegressor
# Baseline 1: k-nearest-neighbours regression with n_neighbors=2.
neigh = KNeighborsRegressor(n_neighbors=2)
neigh.fit(X_train, Y_train)
KNeighborsRegressor(n_neighbors=2)
# Baseline 2: random forest with default hyper-parameters.
# NOTE(review): no random_state is set on any of these models, so scores vary
# slightly between runs.
rf = RandomForestRegressor()
rf.fit(X_train,Y_train)
RandomForestRegressor()
# Baseline 3: a single decision tree with default hyper-parameters.
dt = tree.DecisionTreeRegressor()
dt.fit(X_train, Y_train)
DecisionTreeRegressor()
# Baseline 4: gradient boosting with default hyper-parameters.
est = GradientBoostingRegressor()
est.fit(X_train, Y_train)
GradientBoostingRegressor()
# Score every fitted baseline on the held-out split.
# The metric is the coefficient of determination (R^2) of a regression, not a
# classification accuracy, so the report is labelled accordingly.
Y_pred_neigh = neigh.predict(X_test)
r2neigh = metrics.r2_score(Y_test, Y_pred_neigh)
Y_pred_rf = rf.predict(X_test)
r2rf = metrics.r2_score(Y_test, Y_pred_rf)
Y_pred_dt = dt.predict(X_test)
r2dt = metrics.r2_score(Y_test, Y_pred_dt)
Y_pred_est = est.predict(X_test)
r2est = metrics.r2_score(Y_test, Y_pred_est)

print("*"*10, "R2 score", "*"*10)
# One ruled-off line per model, in the same order and layout as before.
for label, score in (
    ("K nearest neighbors: ", r2neigh),
    ("random forest: ", r2rf),
    ("decision tree: ", r2dt),
    ("gradient boosting: ", r2est),
):
    print("-"*30)
    print(label, score)
    print("-"*30)
********** Accuracy ********** ------------------------------ K nearest neighbors: 0.9632261125895967 ------------------------------ ------------------------------ random forest: 0.9683946505295005 ------------------------------ ------------------------------ decision tree: 0.9399728872719846 ------------------------------ ------------------------------ gradient boosting: 0.9617905997973549 ------------------------------
As we can see, Random Forest gives the best R² score on the test split.
from sklearn.model_selection import RandomizedSearchCV
# Search space for the randomized hyper-parameter search over
# RandomForestRegressor.

# Number of trees: 10, 20, ..., 1000.
n_estimators = [int(x) for x in np.linspace(start = 10, stop = 1000, num = 100)]
# Features considered at each split. 'auto' was deprecated in scikit-learn 1.1
# and removed in 1.3; for regressors it meant "use all features", which the
# fraction 1.0 expresses on every version.
max_features = [1.0, 'sqrt', 'log2']
# Maximum tree depth: 10, 20, ..., 200, plus None for unlimited depth.
max_depth = [int(x) for x in np.linspace(10, 200, num = 20)]
max_depth.append(None)
# Minimum samples required to split an internal node / to remain in a leaf.
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
# Whether each tree is trained on a bootstrap sample or on the full training set.
bootstrap = [True, False]
random_grid = {
    'n_estimators': n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
    'bootstrap': bootstrap,
}
print(random_grid)
{'n_estimators': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 210, 220, 230, 240, 250, 260, 270, 280, 290, 300, 310, 320, 330, 340, 350, 360, 370, 380, 390, 400, 410, 420, 430, 440, 450, 460, 470, 480, 490, 500, 510, 520, 530, 540, 550, 560, 570, 580, 590, 600, 610, 620, 630, 640, 650, 660, 670, 680, 690, 700, 710, 720, 730, 740, 750, 760, 770, 780, 790, 800, 810, 820, 830, 840, 850, 860, 870, 880, 890, 900, 910, 920, 930, 940, 950, 960, 970, 980, 990, 1000], 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}
# Randomized search over `random_grid`: 100 sampled parameter combinations,
# each evaluated with 3-fold cross-validation on the training split, using
# all available cores. random_state fixes which combinations are sampled.
# (The three statements were collapsed onto one line, which is not valid
# Python; they are separated here.)
rf = RandomForestRegressor()
rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid,
    n_iter=100,
    cv=3,
    verbose=0,
    random_state=42,
    n_jobs=-1,
)
rf_random.fit(X_train, Y_train)
# Refit a forest with the best parameters found by the randomized search and
# score it on the held-out split.
rf = RandomForestRegressor(**rf_random.best_params_)
rf.fit(X_train, Y_train)
Y_pred_rf = rf.predict(X_test)
r2rf = metrics.r2_score(Y_test, Y_pred_rf)
# The reported number is R^2 of a regression, not a classification accuracy.
print("-"*30)
print("R2 score: ", r2rf)
print("-"*30)
------------------------------ Accuracy: 0.968881458729958 ------------------------------