import pandas as pd
housing = pd.read_csv("data.csv")
housing.head()  # return the top 5 rows
index | CRIM | ZN | INDUS | CHAS | NOX | RM | AGE | DIS | RAD | TAX | PTRATIO | B | LSTAT | MEDV
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.00632 | 18.0 | 2.31 | 0 | 0.538 | 6.575 | 65.2 | 4.0900 | 1 | 296 | 15.3 | 396.90 | 4.98 | 24.0 |
1 | 0.02731 | 0.0 | 7.07 | 0 | 0.469 | 6.421 | 78.9 | 4.9671 | 2 | 242 | 17.8 | 396.90 | 9.14 | 21.6 |
2 | 0.02729 | 0.0 | 7.07 | 0 | 0.469 | 7.185 | 61.1 | 4.9671 | 2 | 242 | 17.8 | 392.83 | 4.03 | 34.7 |
3 | 0.03237 | 0.0 | 2.18 | 0 | 0.458 | 6.998 | 45.8 | 6.0622 | 3 | 222 | 18.7 | 394.63 | 2.94 | 33.4 |
4 | 0.06905 | 0.0 | 2.18 | 0 | 0.458 | 7.147 | 54.2 | 6.0622 | 3 | 222 | 18.7 | 396.90 | 5.33 | 36.2 |
housing.info()  # information about each column in the csv: non-null count and dtype
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 CRIM 506 non-null float64
1 ZN 506 non-null float64
2 INDUS 506 non-null float64
3 CHAS 506 non-null int64
4 NOX 506 non-null float64
5 RM 501 non-null float64
6 AGE 506 non-null float64
7 DIS 506 non-null float64
8 RAD 506 non-null int64
9 TAX 506 non-null int64
10 PTRATIO 506 non-null float64
11 B 506 non-null float64
12 LSTAT 506 non-null float64
13 MEDV 506 non-null float64
dtypes: float64(11), int64(3)
memory usage: 55.5 KB
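Since RM shows only 501 non-null entries out of 506, a quick per-column count of the missing values (a sketch) confirms where the gaps are:

housing.isnull().sum()  # RM should report 5 missing values; every other column 0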
housing['CHAS'].value_counts()  # return each distinct value in the selected column along with its count
0 471
1 35
Name: CHAS, dtype: int64
housing['NOX'].value_counts()
0.538 23
0.713 18
0.437 17
0.871 16
0.624 15
..
0.394 1
0.518 1
0.385 1
0.389 1
0.435 1
Name: NOX, Length: 81, dtype: int64
# describe() returns: 1. count (ignores null values) 2. mean 3. standard deviation
# 4. min and max values 5. percentiles, i.e. the value below which the stated
#    percentage of values fall
housing.describe()
stat | CRIM | ZN | INDUS | CHAS | NOX | RM | AGE | DIS | RAD | TAX | PTRATIO | B | LSTAT | MEDV
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 506.000000 | 506.000000 | 506.000000 | 506.000000 | 506.000000 | 501.000000 | 506.000000 | 506.000000 | 506.000000 | 506.000000 | 506.000000 | 506.000000 | 506.000000 | 506.000000 |
mean | 3.613524 | 11.363636 | 11.136779 | 0.069170 | 0.554695 | 6.285850 | 68.574901 | 3.795043 | 9.549407 | 408.237154 | 18.455534 | 356.674032 | 12.653063 | 22.532806 |
std | 8.601545 | 23.322453 | 6.860353 | 0.253994 | 0.115878 | 0.701639 | 28.148861 | 2.105710 | 8.707259 | 168.537116 | 2.164946 | 91.294864 | 7.141062 | 9.197104 |
min | 0.006320 | 0.000000 | 0.460000 | 0.000000 | 0.385000 | 3.561000 | 2.900000 | 1.129600 | 1.000000 | 187.000000 | 12.600000 | 0.320000 | 1.730000 | 5.000000 |
25% | 0.082045 | 0.000000 | 5.190000 | 0.000000 | 0.449000 | 5.885000 | 45.025000 | 2.100175 | 4.000000 | 279.000000 | 17.400000 | 375.377500 | 6.950000 | 17.025000 |
50% | 0.256510 | 0.000000 | 9.690000 | 0.000000 | 0.538000 | 6.202000 | 77.500000 | 3.207450 | 5.000000 | 330.000000 | 19.050000 | 391.440000 | 11.360000 | 21.200000 |
75% | 3.677083 | 12.500000 | 18.100000 | 0.000000 | 0.624000 | 6.625000 | 94.075000 | 5.188425 | 24.000000 | 666.000000 | 20.200000 | 396.225000 | 16.955000 | 25.000000 |
max | 88.976200 | 100.000000 | 27.740000 | 1.000000 | 0.871000 | 8.780000 | 100.000000 | 12.126500 | 24.000000 | 711.000000 | 22.000000 | 396.900000 | 37.970000 | 50.000000 |
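The percentile rows are cut-points: the 25% row for RM (5.885) is the value below which a quarter of the RM values fall. The same number can be queried directly, as in this sketch:

housing["RM"].quantile(0.25)  # first quartile of RM, matching the 25% row above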
%matplotlib inline
# render plots inline in the notebook during execution
import matplotlib.pyplot as plt
housing.hist(bins=50, figsize=(20, 15))
[Output: a 4x4 grid of histograms, one per attribute: CRIM, ZN, INDUS, CHAS, NOX, RM, AGE, DIS, RAD, TAX, PTRATIO, B, LSTAT, MEDV (two empty panels)]
import numpy as np
# for learning purposes; scikit-learn provides the same functionality (see below)
def split_train_test(data, test_ratio):
    # test_ratio is the fraction of the data set aside for testing
    # and never used during training
    np.random.seed(42)  # fixes the shuffle so the same test set is produced on every run,
                        # preventing test data from gradually leaking into the training data
    shuffled = np.random.permutation(len(data))  # indices of the data, randomly shuffled
    test_set_size = int(len(data) * test_ratio)
    test_indices = shuffled[:test_set_size]
    train_indices = shuffled[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]
#train_set, test_set=split_train_test(housing,0.2)
#print(f"Rows in train set: {len(train_set)}\nRows in test set: {len(test_set)}")
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)
print(f"Rows in train set: {len(train_set)}\nRows in test set: {len(test_set)}")
Rows in train set: 404
Rows in test set: 102
from sklearn.model_selection import StratifiedShuffleSplit
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(housing, housing['CHAS']):
    strat_train_set = housing.loc[train_index]
    strat_test_set = housing.loc[test_index]
strat_test_set
index | CRIM | ZN | INDUS | CHAS | NOX | RM | AGE | DIS | RAD | TAX | PTRATIO | B | LSTAT | MEDV
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
342 | 0.02498 | 0.0 | 1.89 | 0 | 0.518 | 6.540 | 59.7 | 6.2669 | 1 | 422 | 15.9 | 389.96 | 8.65 | 16.5 |
379 | 17.86670 | 0.0 | 18.10 | 0 | 0.671 | 6.223 | 100.0 | 1.3861 | 24 | 666 | 20.2 | 393.74 | 21.78 | 10.2 |
223 | 0.61470 | 0.0 | 6.20 | 0 | 0.507 | 6.618 | 80.8 | 3.2721 | 8 | 307 | 17.4 | 396.90 | 7.60 | 30.1 |
219 | 0.11425 | 0.0 | 13.89 | 1 | 0.550 | 6.373 | 92.4 | 3.3633 | 5 | 276 | 16.4 | 393.74 | 10.50 | 23.0 |
48 | 0.25387 | 0.0 | 6.91 | 0 | 0.448 | 5.399 | 95.3 | 5.8700 | 3 | 233 | 17.9 | 396.90 | 30.81 | 14.4 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
88 | 0.05660 | 0.0 | 3.41 | 0 | 0.489 | 7.007 | 86.3 | 3.4217 | 2 | 270 | 17.8 | 396.90 | 5.50 | 23.6 |
466 | 3.77498 | 0.0 | 18.10 | 0 | 0.655 | 5.952 | 84.7 | 2.8715 | 24 | 666 | 20.2 | 22.01 | 17.15 | 19.0 |
52 | 0.05360 | 21.0 | 5.64 | 0 | 0.439 | 6.511 | 21.1 | 6.8147 | 4 | 243 | 16.8 | 396.90 | 5.28 | 25.0 |
121 | 0.07165 | 0.0 | 25.65 | 0 | 0.581 | 6.004 | 84.1 | 2.1974 | 2 | 188 | 19.1 | 377.67 | 14.27 | 20.3 |
218 | 0.11069 | 0.0 | 13.89 | 1 | 0.550 | 5.951 | 93.8 | 2.8893 | 5 | 276 | 16.4 | 396.90 | 17.92 | 21.5 |
102 rows × 14 columns
strat_test_set['CHAS'].value_counts()
0 95
1 7
Name: CHAS, dtype: int64
strat_train_set['CHAS'].value_counts()
0 376
1 28
Name: CHAS, dtype: int64
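As a sanity check (a sketch reusing the variables above; at this point housing still holds the full dataset), the proportion of CHAS = 1 should be nearly identical across the full data and both stratified splits:

for name, subset in [("full", housing), ("train", strat_train_set), ("test", strat_test_set)]:
    print(name, round((subset["CHAS"] == 1).mean(), 4))  # all three should print roughly 0.069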
housing = strat_train_set.copy()  # copy the training data into the working housing variable
corr_matrix = housing.corr()
corr_matrix['MEDV'].sort_values(ascending=False)
MEDV 1.000000
RM 0.683023
B 0.361761
ZN 0.339741
DIS 0.240451
CHAS 0.205066
AGE -0.364596
RAD -0.374693
CRIM -0.393715
NOX -0.422873
TAX -0.456657
INDUS -0.473516
PTRATIO -0.493534
LSTAT -0.740494
Name: MEDV, dtype: float64
from pandas.plotting import scatter_matrix
= ["MEDV","RM","ZN","LSTAT"]
attributes =(12,8)) scatter_matrix(housing[attributes], figsize
[Output: a 4x4 scatter matrix of MEDV, RM, ZN and LSTAT: pairwise scatter plots with histograms on the diagonal]
="scatter",x="RM",y="MEDV",alpha=0.8) housing.plot(kind
<AxesSubplot:xlabel='RM', ylabel='MEDV'>
"TAXRM"] = housing["TAX"]/housing["RM"] housing[
"TAXRM"] housing[
254 51.571709
348 42.200452
476 102.714374
321 45.012547
326 45.468948
...
155 65.507152
423 109.126659
98 35.294118
455 102.068966
216 46.875000
Name: TAXRM, Length: 404, dtype: float64
housing.head()
index | CRIM | ZN | INDUS | CHAS | NOX | RM | AGE | DIS | RAD | TAX | PTRATIO | B | LSTAT | MEDV | TAXRM
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
254 | 0.04819 | 80.0 | 3.64 | 0 | 0.392 | 6.108 | 32.0 | 9.2203 | 1 | 315 | 16.4 | 392.89 | 6.57 | 21.9 | 51.571709 |
348 | 0.01501 | 80.0 | 2.01 | 0 | 0.435 | 6.635 | 29.7 | 8.3440 | 4 | 280 | 17.0 | 390.94 | 5.99 | 24.5 | 42.200452 |
476 | 4.87141 | 0.0 | 18.10 | 0 | 0.614 | 6.484 | 93.6 | 2.3053 | 24 | 666 | 20.2 | 396.21 | 18.68 | 16.7 | 102.714374 |
321 | 0.18159 | 0.0 | 7.38 | 0 | 0.493 | 6.376 | 54.3 | 4.5404 | 5 | 287 | 19.6 | 396.90 | 6.87 | 23.1 | 45.012547 |
326 | 0.30347 | 0.0 | 7.38 | 0 | 0.493 | 6.312 | 28.9 | 5.4159 | 5 | 287 | 19.6 | 396.90 | 6.15 | 23.0 | 45.468948 |
corr_matrix = housing.corr()
corr_matrix['MEDV'].sort_values(ascending=False)
MEDV 1.000000
RM 0.683023
B 0.361761
ZN 0.339741
DIS 0.240451
CHAS 0.205066
AGE -0.364596
RAD -0.374693
CRIM -0.393715
NOX -0.422873
TAX -0.456657
INDUS -0.473516
PTRATIO -0.493534
TAXRM -0.528591
LSTAT -0.740494
Name: MEDV, dtype: float64
="scatter",x="TAXRM",y="MEDV",alpha=0.8) housing.plot(kind
<AxesSubplot:xlabel='TAXRM', ylabel='MEDV'>
housing = strat_train_set.drop("MEDV", axis=1)
housing_labels = strat_train_set["MEDV"].copy()
# Option 1: drop the rows whose RM value is missing
housing.dropna(subset=["RM"]).shape
# Note: the original dataframe is untouched unless inplace=True is passed.
(400, 13)
# Option 2: drop the whole RM attribute
housing.drop("RM", axis=1).shape
# Note: the original dataframe is untouched.
(404, 12)
= housing["RM"].median()
median "RM"].fillna(median)
housing[# Note: No data in original dataframe is harmed.
254 6.108
348 6.635
476 6.484
321 6.376
326 6.312
...
155 6.152
423 6.103
98 7.820
455 6.525
216 5.888
Name: RM, Length: 404, dtype: float64
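If we actually wanted to persist option 3 (not done here, since the notebook switches to SimpleImputer below), the result would be assigned back, as in this one-line sketch:

housing["RM"] = housing["RM"].fillna(median)  # writes the filled column back into the dataframe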
housing.describe()  # before imputing/filling the missing attributes
stat | CRIM | ZN | INDUS | CHAS | NOX | RM | AGE | DIS | RAD | TAX | PTRATIO | B | LSTAT
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 404.000000 | 404.000000 | 404.000000 | 404.000000 | 404.000000 | 400.000000 | 404.000000 | 404.000000 | 404.000000 | 404.000000 | 404.000000 | 404.000000 | 404.000000 |
mean | 3.602814 | 10.836634 | 11.344950 | 0.069307 | 0.558064 | 6.282808 | 69.039851 | 3.746210 | 9.735149 | 412.341584 | 18.473267 | 353.392822 | 12.791609 |
std | 8.099383 | 22.150636 | 6.877817 | 0.254290 | 0.116875 | 0.711595 | 28.258248 | 2.099057 | 8.731259 | 168.672623 | 2.129243 | 96.069235 | 7.235740 |
min | 0.006320 | 0.000000 | 0.740000 | 0.000000 | 0.389000 | 3.561000 | 2.900000 | 1.129600 | 1.000000 | 187.000000 | 13.000000 | 0.320000 | 1.730000 |
25% | 0.086962 | 0.000000 | 5.190000 | 0.000000 | 0.453000 | 5.878750 | 44.850000 | 2.035975 | 4.000000 | 284.000000 | 17.400000 | 374.617500 | 6.847500 |
50% | 0.286735 | 0.000000 | 9.900000 | 0.000000 | 0.538000 | 6.209000 | 78.200000 | 3.122200 | 5.000000 | 337.000000 | 19.000000 | 390.955000 | 11.570000 |
75% | 3.731923 | 12.500000 | 18.100000 | 0.000000 | 0.631000 | 6.632000 | 94.100000 | 5.100400 | 24.000000 | 666.000000 | 20.200000 | 395.630000 | 17.102500 |
max | 73.534100 | 100.000000 | 27.740000 | 1.000000 | 0.871000 | 8.780000 | 100.000000 | 12.126500 | 24.000000 | 711.000000 | 22.000000 | 396.900000 | 36.980000 |
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy="median")
imputer.fit(housing)
SimpleImputer(strategy='median')
imputer.statistics_  # shows the learned median of each column
array([2.86735e-01, 0.00000e+00, 9.90000e+00, 0.00000e+00, 5.38000e-01,
6.20900e+00, 7.82000e+01, 3.12220e+00, 5.00000e+00, 3.37000e+02,
1.90000e+01, 3.90955e+02, 1.15700e+01])
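As a quick sketch of a sanity check, the learned statistics should equal the columns' own medians:

import numpy as np
print(np.allclose(imputer.statistics_, housing.median().values))  # expect True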
X = imputer.transform(housing)
housing_tr = pd.DataFrame(X, columns=housing.columns)  # transformed dataframe without missing values
housing_tr.describe()
stat | CRIM | ZN | INDUS | CHAS | NOX | RM | AGE | DIS | RAD | TAX | PTRATIO | B | LSTAT
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 404.000000 | 404.000000 | 404.000000 | 404.000000 | 404.000000 | 404.000000 | 404.000000 | 404.000000 | 404.000000 | 404.000000 | 404.000000 | 404.000000 | 404.000000 |
mean | 3.602814 | 10.836634 | 11.344950 | 0.069307 | 0.558064 | 6.282077 | 69.039851 | 3.746210 | 9.735149 | 412.341584 | 18.473267 | 353.392822 | 12.791609 |
std | 8.099383 | 22.150636 | 6.877817 | 0.254290 | 0.116875 | 0.708093 | 28.258248 | 2.099057 | 8.731259 | 168.672623 | 2.129243 | 96.069235 | 7.235740 |
min | 0.006320 | 0.000000 | 0.740000 | 0.000000 | 0.389000 | 3.561000 | 2.900000 | 1.129600 | 1.000000 | 187.000000 | 13.000000 | 0.320000 | 1.730000 |
25% | 0.086962 | 0.000000 | 5.190000 | 0.000000 | 0.453000 | 5.879750 | 44.850000 | 2.035975 | 4.000000 | 284.000000 | 17.400000 | 374.617500 | 6.847500 |
50% | 0.286735 | 0.000000 | 9.900000 | 0.000000 | 0.538000 | 6.209000 | 78.200000 | 3.122200 | 5.000000 | 337.000000 | 19.000000 | 390.955000 | 11.570000 |
75% | 3.731923 | 12.500000 | 18.100000 | 0.000000 | 0.631000 | 6.630250 | 94.100000 | 5.100400 | 24.000000 | 666.000000 | 20.200000 | 395.630000 | 17.102500 |
max | 73.534100 | 100.000000 | 27.740000 | 1.000000 | 0.871000 | 8.780000 | 100.000000 | 12.126500 | 24.000000 | 711.000000 | 22.000000 | 396.900000 | 36.980000 |
Scikit-learn primarily has three types of objects (illustrated in the sketch below):

1. Estimators: estimate parameters from a dataset via fit().
2. Transformers: transform a dataset via transform(); fit_transform() does both in one call.
3. Predictors: make predictions on data via predict().
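A minimal, hypothetical sketch of the three interfaces (the toy frame, column names, and targets below are made up for illustration):

import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression

toy = pd.DataFrame({"a": [1.0, np.nan, 3.0], "b": [4.0, 5.0, 6.0]})  # made-up data

imp = SimpleImputer(strategy="median")
imp.fit(toy)            # 1. Estimator: fit() learns parameters (here, the per-column medians)
X = imp.transform(toy)  # 2. Transformer: transform() applies what fit() learned

reg = LinearRegression().fit(X, [10.0, 20.0, 30.0])  # made-up targets
print(reg.predict(X))   # 3. Predictor: predict() estimates targets for given data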
Primarily, there are two types of feature scaling methods (a small comparison sketch follows this list):

1. Min-Max scaling (normalization): (value - min)/(max - min). scikit-learn provides a class called MinMaxScaler.
2. Standardization: (value - mean)/std, where std is the standard deviation. scikit-learn provides a class called StandardScaler.
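A minimal sketch contrasting the two scalers on a made-up column (values are illustrative only):

import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler

values = np.array([[1.0], [2.0], [3.0], [4.0], [10.0]])  # toy column with one large value

print(MinMaxScaler().fit_transform(values).ravel())    # (value - min)/(max - min): squeezed into [0, 1]
print(StandardScaler().fit_transform(values).ravel())  # (value - mean)/std: zero mean, unit variance

Min-Max scaling is sensitive to outliers (the 10.0 compresses the other values toward 0), which is one reason the pipeline below uses StandardScaler.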
A pipeline is used to automate a series of transformations so they run as a single step.
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
my_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    # ... add as many steps as needed
    ('std_scaler', StandardScaler()),
])
housing_num_tr = my_pipeline.fit_transform(housing_tr)
housing_num_tr  # the pipeline returns a NumPy array
array([[-0.43942006, 3.12628155, -1.12165014, ..., -0.97491834,
0.41164221, -0.86091034],
[-0.44352175, 3.12628155, -1.35893781, ..., -0.69277865,
0.39131918, -0.94116739],
[ 0.15682292, -0.4898311 , 0.98336806, ..., 0.81196637,
0.44624347, 0.81480158],
...,
[-0.43525657, -0.4898311 , -1.23083158, ..., -0.22254583,
0.41831233, -1.27603303],
[ 0.14210728, -0.4898311 , 0.98336806, ..., 0.81196637,
-3.15239177, 0.73869575],
[-0.43974024, -0.4898311 , 0.37049623, ..., -0.97491834,
0.41070422, 0.09940681]])
housing_num_tr.shape
(404, 13)
#from sklearn.linear_model import LinearRegression
#from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor  # chosen model
#model = LinearRegression()       # discarded: its error of 22.8 was far too high
#model = DecisionTreeRegressor()  # discarded: overfits (see the training-error discussion below)
model = RandomForestRegressor()
model.fit(housing_num_tr, housing_labels)
RandomForestRegressor()
some_data = housing.iloc[:5]
some_labels = housing_labels.iloc[:5]
prepared_data = my_pipeline.transform(some_data)
model.predict(prepared_data)  # gives the predicted values in the form of an array
array([22.317, 25.696, 16.502, 23.511, 23.583])
list(some_labels) #These are the expected values
[21.9, 24.5, 16.7, 23.1, 23.0]
from sklearn.metrics import mean_squared_error
housing_predictions = model.predict(housing_num_tr)
#lin_mse = mean_squared_error(housing_labels, housing_predictions)  # linear model's mean squared error
#lin_rmse = np.sqrt(lin_mse)                                        # and its root mean squared error
mse = mean_squared_error(housing_labels, housing_predictions)  # mean squared error
rmse = np.sqrt(mse)  # root mean squared error
# With DecisionTreeRegressor the training error came out to 0, meaning the model had learned
# the dataset along with factors like noise; that is overfitting. A model needs to follow the
# trend, not the noise, so both overfitting and underfitting must be avoided.
rmse
1.3030784295522488
Think of the training data as split into 10 folds, numbered 1 to 10. We will test stepwise for errors: in the first round, hold out fold 1 and train on folds 2 to 10; in the next round, hold out fold 2 and train on the rest, and so on, until every fold has served once as the validation set.
from sklearn.model_selection import cross_val_score
scores = cross_val_score(model, housing_num_tr, housing_labels, scoring="neg_mean_squared_error", cv=10)
rmse_scores = np.sqrt(-scores)  # the scores are negated MSEs, so flip the sign before taking the root
rmse_scores
array([2.76236684, 2.72915525, 4.43665258, 2.73340914, 3.40079296,
2.61371069, 5.15244869, 3.28961837, 3.02669383, 3.1373396 ])
def print_scores(scores):
    print("Scores: ", scores)
    print("Mean: ", scores.mean())
    print("Standard Deviation: ", scores.std())

print_scores(rmse_scores)  # best model so far: RandomForestRegressor
Scores: [2.76236684 2.72915525 4.43665258 2.73340914 3.40079296 2.61371069
5.15244869 3.28961837 3.02669383 3.1373396 ]
Mean: 3.3282187957275076
Standard Deviation: 0.789552738177566
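For comparison, the same 10-fold evaluation can be run on the discarded models; this sketch reuses housing_num_tr and housing_labels from above, and the random_state on the tree is an added assumption for reproducibility:

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor

for candidate in (LinearRegression(), DecisionTreeRegressor(random_state=42)):
    s = cross_val_score(candidate, housing_num_tr, housing_labels,
                        scoring="neg_mean_squared_error", cv=10)
    print(type(candidate).__name__, np.sqrt(-s).mean())  # mean cross-validated RMSE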
from joblib import dump,load
"DRL.joblib") dump(model,
['DRL.joblib']
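The saved model can later be reloaded for inference with the load function imported above, as in this one-line sketch:

loaded_model = load("DRL.joblib")  # restores the fitted RandomForestRegressor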
= strat_test_set.drop("MEDV", axis=1) #Dropping label
X_test = strat_test_set["MEDV"].copy()
Y_test = my_pipeline.transform(X_test)
X_test_prepared = model.predict(X_test_prepared)
final_predictions = mean_squared_error(Y_test, final_predictions)
final_mse = np.sqrt(final_mse) final_rmse
#The lesser the rmse, the better is the model. But it should not be 0. final_rmse
3.0763381131264658
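Rather than trusting the point estimate alone, a 95% confidence interval can be attached to the test RMSE; this sketch assumes scipy is installed and reuses final_predictions and Y_test from above:

import numpy as np
from scipy import stats

squared_errors = (final_predictions - Y_test) ** 2
confidence_interval = np.sqrt(stats.t.interval(
    0.95, len(squared_errors) - 1,
    loc=squared_errors.mean(), scale=stats.sem(squared_errors)))
print(confidence_interval)  # plausible range for the true generalization RMSE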