import pandas as pd

housing = pd.read_csv("data.csv")
housing.head() # returns the top 5 rows

| | CRIM | ZN | INDUS | CHAS | NOX | RM | AGE | DIS | RAD | TAX | PTRATIO | B | LSTAT | MEDV |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.00632 | 18.0 | 2.31 | 0 | 0.538 | 6.575 | 65.2 | 4.0900 | 1 | 296 | 15.3 | 396.90 | 4.98 | 24.0 |
| 1 | 0.02731 | 0.0 | 7.07 | 0 | 0.469 | 6.421 | 78.9 | 4.9671 | 2 | 242 | 17.8 | 396.90 | 9.14 | 21.6 |
| 2 | 0.02729 | 0.0 | 7.07 | 0 | 0.469 | 7.185 | 61.1 | 4.9671 | 2 | 242 | 17.8 | 392.83 | 4.03 | 34.7 |
| 3 | 0.03237 | 0.0 | 2.18 | 0 | 0.458 | 6.998 | 45.8 | 6.0622 | 3 | 222 | 18.7 | 394.63 | 2.94 | 33.4 |
| 4 | 0.06905 | 0.0 | 2.18 | 0 | 0.458 | 7.147 | 54.2 | 6.0622 | 3 | 222 | 18.7 | 396.90 | 5.33 | 36.2 |
housing.info() # gives information about each column in the CSV

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 CRIM 506 non-null float64
1 ZN 506 non-null float64
2 INDUS 506 non-null float64
3 CHAS 506 non-null int64
4 NOX 506 non-null float64
5 RM 501 non-null float64
6 AGE 506 non-null float64
7 DIS 506 non-null float64
8 RAD 506 non-null int64
9 TAX 506 non-null int64
10 PTRATIO 506 non-null float64
11 B 506 non-null float64
12 LSTAT 506 non-null float64
13 MEDV 506 non-null float64
dtypes: float64(11), int64(3)
memory usage: 55.5 KB
housing['CHAS'].value_counts() # returns each distinct value in the selected column along with its count

0 471
1 35
Name: CHAS, dtype: int64
housing['NOX'].value_counts()

0.538 23
0.713 18
0.437 17
0.871 16
0.624 15
..
0.394 1
0.518 1
0.385 1
0.389 1
0.435 1
Name: NOX, Length: 81, dtype: int64
housing.describe() # returns: 1. count (ignores null values) 2. mean 3. standard deviation 4. min and max values 5. percentiles, i.e. the value below which the given percentage of observations falls

| | CRIM | ZN | INDUS | CHAS | NOX | RM | AGE | DIS | RAD | TAX | PTRATIO | B | LSTAT | MEDV |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 506.000000 | 506.000000 | 506.000000 | 506.000000 | 506.000000 | 501.000000 | 506.000000 | 506.000000 | 506.000000 | 506.000000 | 506.000000 | 506.000000 | 506.000000 | 506.000000 |
| mean | 3.613524 | 11.363636 | 11.136779 | 0.069170 | 0.554695 | 6.285850 | 68.574901 | 3.795043 | 9.549407 | 408.237154 | 18.455534 | 356.674032 | 12.653063 | 22.532806 |
| std | 8.601545 | 23.322453 | 6.860353 | 0.253994 | 0.115878 | 0.701639 | 28.148861 | 2.105710 | 8.707259 | 168.537116 | 2.164946 | 91.294864 | 7.141062 | 9.197104 |
| min | 0.006320 | 0.000000 | 0.460000 | 0.000000 | 0.385000 | 3.561000 | 2.900000 | 1.129600 | 1.000000 | 187.000000 | 12.600000 | 0.320000 | 1.730000 | 5.000000 |
| 25% | 0.082045 | 0.000000 | 5.190000 | 0.000000 | 0.449000 | 5.885000 | 45.025000 | 2.100175 | 4.000000 | 279.000000 | 17.400000 | 375.377500 | 6.950000 | 17.025000 |
| 50% | 0.256510 | 0.000000 | 9.690000 | 0.000000 | 0.538000 | 6.202000 | 77.500000 | 3.207450 | 5.000000 | 330.000000 | 19.050000 | 391.440000 | 11.360000 | 21.200000 |
| 75% | 3.677083 | 12.500000 | 18.100000 | 0.000000 | 0.624000 | 6.625000 | 94.075000 | 5.188425 | 24.000000 | 666.000000 | 20.200000 | 396.225000 | 16.955000 | 25.000000 |
| max | 88.976200 | 100.000000 | 27.740000 | 1.000000 | 0.871000 | 8.780000 | 100.000000 | 12.126500 | 24.000000 | 711.000000 | 22.000000 | 396.900000 | 37.970000 | 50.000000 |
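The percentile rows can be reproduced directly with `quantile()` (a quick sketch, not in the original notebook) — the results should match the 25%/50%/75% rows of `describe()` above:

# Hypothetical check: quantile() reproduces the percentile rows of describe().
print(housing["RM"].quantile(0.25)) # ~5.885, i.e. 25% of RM values fall below this
print(housing["RM"].quantile(0.50)) # ~6.202, the median
print(housing["RM"].quantile(0.75)) # ~6.625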
%matplotlib inline
# renders plots inline, i.e. directly in the notebook output
import matplotlib.pyplot as plt

housing.hist(bins=50, figsize=(20, 15))

[Output: a 4×4 grid of histograms, one per attribute — CRIM, ZN, INDUS, CHAS, NOX, RM, AGE, DIS, RAD, TAX, PTRATIO, B, LSTAT and MEDV]

import numpy as np # the manual split below is for learning purposes only; scikit-learn provides the same functionality

def split_train_test(data, test_ratio): # test_ratio is the fraction of data reserved for testing, never used in training
    np.random.seed(42) # fixes the shuffle across runs so the same rows always land in the test set; otherwise repeated runs would eventually expose the whole dataset to training (test-set leakage)
    shuffled = np.random.permutation(len(data)) # a random permutation of the row indices
    test_set_size = int(len(data) * test_ratio)
    test_indices = shuffled[:test_set_size]
    train_indices = shuffled[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]

# train_set, test_set = split_train_test(housing, 0.2)
# print(f"Rows in train set: {len(train_set)}\nRows in test set: {len(test_set)}")

from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)
print(f"Rows in train set: {len(train_set)}\nRows in test set: {len(test_set)}")

Rows in train set: 404
Rows in test set: 102
from sklearn.model_selection import StratifiedShuffleSplit

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(housing, housing['CHAS']): # stratify on CHAS so both sets keep its 0/1 proportions
    strat_train_set = housing.loc[train_index]
    strat_test_set = housing.loc[test_index]

strat_test_set

| | CRIM | ZN | INDUS | CHAS | NOX | RM | AGE | DIS | RAD | TAX | PTRATIO | B | LSTAT | MEDV |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 342 | 0.02498 | 0.0 | 1.89 | 0 | 0.518 | 6.540 | 59.7 | 6.2669 | 1 | 422 | 15.9 | 389.96 | 8.65 | 16.5 |
| 379 | 17.86670 | 0.0 | 18.10 | 0 | 0.671 | 6.223 | 100.0 | 1.3861 | 24 | 666 | 20.2 | 393.74 | 21.78 | 10.2 |
| 223 | 0.61470 | 0.0 | 6.20 | 0 | 0.507 | 6.618 | 80.8 | 3.2721 | 8 | 307 | 17.4 | 396.90 | 7.60 | 30.1 |
| 219 | 0.11425 | 0.0 | 13.89 | 1 | 0.550 | 6.373 | 92.4 | 3.3633 | 5 | 276 | 16.4 | 393.74 | 10.50 | 23.0 |
| 48 | 0.25387 | 0.0 | 6.91 | 0 | 0.448 | 5.399 | 95.3 | 5.8700 | 3 | 233 | 17.9 | 396.90 | 30.81 | 14.4 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 88 | 0.05660 | 0.0 | 3.41 | 0 | 0.489 | 7.007 | 86.3 | 3.4217 | 2 | 270 | 17.8 | 396.90 | 5.50 | 23.6 |
| 466 | 3.77498 | 0.0 | 18.10 | 0 | 0.655 | 5.952 | 84.7 | 2.8715 | 24 | 666 | 20.2 | 22.01 | 17.15 | 19.0 |
| 52 | 0.05360 | 21.0 | 5.64 | 0 | 0.439 | 6.511 | 21.1 | 6.8147 | 4 | 243 | 16.8 | 396.90 | 5.28 | 25.0 |
| 121 | 0.07165 | 0.0 | 25.65 | 0 | 0.581 | 6.004 | 84.1 | 2.1974 | 2 | 188 | 19.1 | 377.67 | 14.27 | 20.3 |
| 218 | 0.11069 | 0.0 | 13.89 | 1 | 0.550 | 5.951 | 93.8 | 2.8893 | 5 | 276 | 16.4 | 396.90 | 17.92 | 21.5 |
102 rows × 14 columns
strat_test_set['CHAS'].value_counts()

0 95
1 7
Name: CHAS, dtype: int64

strat_train_set['CHAS'].value_counts()

0 376
1 28
Name: CHAS, dtype: int64
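As a quick sanity check (a sketch, not in the original notebook), the proportion of CHAS values should be nearly identical in both sets — roughly 376/404 ≈ 95/102 ≈ 93% zeros — which is exactly what stratified sampling guarantees:

# Hypothetical check: CHAS proportions match across the stratified sets.
print(strat_train_set['CHAS'].value_counts(normalize=True)) # ~0.93 / ~0.07
print(strat_test_set['CHAS'].value_counts(normalize=True))  # ~0.93 / ~0.07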
housing = strat_train_set.copy() # work on a copy of the training set so the original stays untouched

corr_matrix = housing.corr()
corr_matrix['MEDV'].sort_values(ascending=False)

MEDV 1.000000
RM 0.683023
B 0.361761
ZN 0.339741
DIS 0.240451
CHAS 0.205066
AGE -0.364596
RAD -0.374693
CRIM -0.393715
NOX -0.422873
TAX -0.456657
INDUS -0.473516
PTRATIO -0.493534
LSTAT -0.740494
Name: MEDV, dtype: float64
from pandas.plotting import scatter_matrix

attributes = ["MEDV", "RM", "ZN", "LSTAT"]
scatter_matrix(housing[attributes], figsize=(12, 8))

[Output: a 4×4 scatter matrix of MEDV, RM, ZN and LSTAT — histograms on the diagonal, pairwise scatter plots elsewhere]

housing.plot(kind="scatter",x="RM",y="MEDV",alpha=0.8)<AxesSubplot:xlabel='RM', ylabel='MEDV'>

housing["TAXRM"] = housing["TAX"]/housing["RM"]housing["TAXRM"]254 51.571709
348 42.200452
476 102.714374
321 45.012547
326 45.468948
...
155 65.507152
423 109.126659
98 35.294118
455 102.068966
216 46.875000
Name: TAXRM, Length: 404, dtype: float64
housing.head()

| | CRIM | ZN | INDUS | CHAS | NOX | RM | AGE | DIS | RAD | TAX | PTRATIO | B | LSTAT | MEDV | TAXRM |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 254 | 0.04819 | 80.0 | 3.64 | 0 | 0.392 | 6.108 | 32.0 | 9.2203 | 1 | 315 | 16.4 | 392.89 | 6.57 | 21.9 | 51.571709 |
| 348 | 0.01501 | 80.0 | 2.01 | 0 | 0.435 | 6.635 | 29.7 | 8.3440 | 4 | 280 | 17.0 | 390.94 | 5.99 | 24.5 | 42.200452 |
| 476 | 4.87141 | 0.0 | 18.10 | 0 | 0.614 | 6.484 | 93.6 | 2.3053 | 24 | 666 | 20.2 | 396.21 | 18.68 | 16.7 | 102.714374 |
| 321 | 0.18159 | 0.0 | 7.38 | 0 | 0.493 | 6.376 | 54.3 | 4.5404 | 5 | 287 | 19.6 | 396.90 | 6.87 | 23.1 | 45.012547 |
| 326 | 0.30347 | 0.0 | 7.38 | 0 | 0.493 | 6.312 | 28.9 | 5.4159 | 5 | 287 | 19.6 | 396.90 | 6.15 | 23.0 | 45.468948 |
corr_matrix = housing.corr()
corr_matrix['MEDV'].sort_values(ascending=False)

MEDV 1.000000
RM 0.683023
B 0.361761
ZN 0.339741
DIS 0.240451
CHAS 0.205066
AGE -0.364596
RAD -0.374693
CRIM -0.393715
NOX -0.422873
TAX -0.456657
INDUS -0.473516
PTRATIO -0.493534
TAXRM -0.528591
LSTAT -0.740494
Name: MEDV, dtype: float64
housing.plot(kind="scatter",x="TAXRM",y="MEDV",alpha=0.8)<AxesSubplot:xlabel='TAXRM', ylabel='MEDV'>

housing = strat_train_set.drop("MEDV", axis=1) # features only; MEDV is the label
housing_labels = strat_train_set["MEDV"].copy()

# Option 1: drop the rows with missing RM values
housing.dropna(subset=["RM"]).shape
# Note: the original dataframe is untouched unless inplace=True is passed.

(400, 13)

# Option 2: drop the whole RM attribute
housing.drop("RM", axis=1).shape
# Note: the original dataframe is untouched.

(404, 12)
median = housing["RM"].median()
housing["RM"].fillna(median)
# Note: No data in original dataframe is harmed.254 6.108
348 6.635
476 6.484
321 6.376
326 6.312
...
155 6.152
423 6.103
98 7.820
455 6.525
216 5.888
Name: RM, Length: 404, dtype: float64
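A quick way to confirm the non-mutation notes above (a sketch, not in the original notebook) — RM should still have its missing values after all three calls:

# Hypothetical check: none of the three options modified housing in place.
print(housing["RM"].isna().sum()) # still 4 missing values (404 rows, 400 non-null)
print(housing.shape)              # still (404, 13)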
housing.describe() # before imputing/filling the missing values

| | CRIM | ZN | INDUS | CHAS | NOX | RM | AGE | DIS | RAD | TAX | PTRATIO | B | LSTAT |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 404.000000 | 404.000000 | 404.000000 | 404.000000 | 404.000000 | 400.000000 | 404.000000 | 404.000000 | 404.000000 | 404.000000 | 404.000000 | 404.000000 | 404.000000 |
| mean | 3.602814 | 10.836634 | 11.344950 | 0.069307 | 0.558064 | 6.282808 | 69.039851 | 3.746210 | 9.735149 | 412.341584 | 18.473267 | 353.392822 | 12.791609 |
| std | 8.099383 | 22.150636 | 6.877817 | 0.254290 | 0.116875 | 0.711595 | 28.258248 | 2.099057 | 8.731259 | 168.672623 | 2.129243 | 96.069235 | 7.235740 |
| min | 0.006320 | 0.000000 | 0.740000 | 0.000000 | 0.389000 | 3.561000 | 2.900000 | 1.129600 | 1.000000 | 187.000000 | 13.000000 | 0.320000 | 1.730000 |
| 25% | 0.086962 | 0.000000 | 5.190000 | 0.000000 | 0.453000 | 5.878750 | 44.850000 | 2.035975 | 4.000000 | 284.000000 | 17.400000 | 374.617500 | 6.847500 |
| 50% | 0.286735 | 0.000000 | 9.900000 | 0.000000 | 0.538000 | 6.209000 | 78.200000 | 3.122200 | 5.000000 | 337.000000 | 19.000000 | 390.955000 | 11.570000 |
| 75% | 3.731923 | 12.500000 | 18.100000 | 0.000000 | 0.631000 | 6.632000 | 94.100000 | 5.100400 | 24.000000 | 666.000000 | 20.200000 | 395.630000 | 17.102500 |
| max | 73.534100 | 100.000000 | 27.740000 | 1.000000 | 0.871000 | 8.780000 | 100.000000 | 12.126500 | 24.000000 | 711.000000 | 22.000000 | 396.900000 | 36.980000 |
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy="median")
imputer.fit(housing)

SimpleImputer(strategy='median')

imputer.statistics_ # the learned median of each column

array([2.86735e-01, 0.00000e+00, 9.90000e+00, 0.00000e+00, 5.38000e-01,
6.20900e+00, 7.82000e+01, 3.12220e+00, 5.00000e+00, 3.37000e+02,
1.90000e+01, 3.90955e+02, 1.15700e+01])
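The fitted statistics can be cross-checked against pandas directly (a sketch, not in the original notebook):

# Hypothetical check: the imputer's learned statistics are just the column medians.
print(np.allclose(imputer.statistics_, housing.median().values)) # True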
X = imputer.transform(housing) # returns a NumPy array with the missing values filled in
housing_tr = pd.DataFrame(X, columns=housing.columns) # the transformed dataframe, now without missing values
housing_tr.describe()

| | CRIM | ZN | INDUS | CHAS | NOX | RM | AGE | DIS | RAD | TAX | PTRATIO | B | LSTAT |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 404.000000 | 404.000000 | 404.000000 | 404.000000 | 404.000000 | 404.000000 | 404.000000 | 404.000000 | 404.000000 | 404.000000 | 404.000000 | 404.000000 | 404.000000 |
| mean | 3.602814 | 10.836634 | 11.344950 | 0.069307 | 0.558064 | 6.282077 | 69.039851 | 3.746210 | 9.735149 | 412.341584 | 18.473267 | 353.392822 | 12.791609 |
| std | 8.099383 | 22.150636 | 6.877817 | 0.254290 | 0.116875 | 0.708093 | 28.258248 | 2.099057 | 8.731259 | 168.672623 | 2.129243 | 96.069235 | 7.235740 |
| min | 0.006320 | 0.000000 | 0.740000 | 0.000000 | 0.389000 | 3.561000 | 2.900000 | 1.129600 | 1.000000 | 187.000000 | 13.000000 | 0.320000 | 1.730000 |
| 25% | 0.086962 | 0.000000 | 5.190000 | 0.000000 | 0.453000 | 5.879750 | 44.850000 | 2.035975 | 4.000000 | 284.000000 | 17.400000 | 374.617500 | 6.847500 |
| 50% | 0.286735 | 0.000000 | 9.900000 | 0.000000 | 0.538000 | 6.209000 | 78.200000 | 3.122200 | 5.000000 | 337.000000 | 19.000000 | 390.955000 | 11.570000 |
| 75% | 3.731923 | 12.500000 | 18.100000 | 0.000000 | 0.631000 | 6.630250 | 94.100000 | 5.100400 | 24.000000 | 666.000000 | 20.200000 | 395.630000 | 17.102500 |
| max | 73.534100 | 100.000000 | 27.740000 | 1.000000 | 0.871000 | 8.780000 | 100.000000 | 12.126500 | 24.000000 | 711.000000 | 22.000000 | 396.900000 | 36.980000 |
Scikit-learn primarily has three types of objects:

1. Estimators — estimate parameters from a dataset via a fit() method (e.g. SimpleImputer).
2. Transformers — transform a dataset via a transform() method, using the parameters learned during fit(); fit_transform() does both in one step.
3. Predictors — make predictions on a dataset via a predict() method and can score them via score() (e.g. LinearRegression).
Primarily, there are two types of feature scaling methods, compared in the sketch below:

1. Min-Max scaling (normalization): (value - min)/(max - min). Sklearn provides a class called MinMaxScaler for this.
2. Standardization: (value - mean)/std, where std is the standard deviation. Sklearn provides a class called StandardScaler for this.
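A minimal sketch (not from the original notebook) contrasting the two scalers on a toy column:

from sklearn.preprocessing import MinMaxScaler, StandardScaler
import numpy as np

toy = np.array([[1.0], [2.0], [3.0], [4.0]]) # toy data, for illustration only
print(MinMaxScaler().fit_transform(toy).ravel())   # [0. 0.333 0.667 1.] — squeezed into [0, 1]
print(StandardScaler().fit_transform(toy).ravel()) # mean 0, std 1: [-1.342 -0.447 0.447 1.342]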
A Pipeline is used to automate a series of transformations, running them one after another.
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

my_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    # ... add as many steps as needed
    ('std_scaler', StandardScaler()),
])

housing_num_tr = my_pipeline.fit_transform(housing_tr)
housing_num_tr # returns a NumPy array

array([[-0.43942006, 3.12628155, -1.12165014, ..., -0.97491834,
0.41164221, -0.86091034],
[-0.44352175, 3.12628155, -1.35893781, ..., -0.69277865,
0.39131918, -0.94116739],
[ 0.15682292, -0.4898311 , 0.98336806, ..., 0.81196637,
0.44624347, 0.81480158],
...,
[-0.43525657, -0.4898311 , -1.23083158, ..., -0.22254583,
0.41831233, -1.27603303],
[ 0.14210728, -0.4898311 , 0.98336806, ..., 0.81196637,
-3.15239177, 0.73869575],
[-0.43974024, -0.4898311 , 0.37049623, ..., -0.97491834,
0.41070422, 0.09940681]])
housing_num_tr.shape

(404, 13)
#from sklearn.linear_model import LinearRegression
#from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor # the chosen model

#model = LinearRegression() # discarded due to a heavy error of 22.8
#model = DecisionTreeRegressor()
model = RandomForestRegressor()
model.fit(housing_num_tr, housing_labels)

RandomForestRegressor()
some_data = housing.iloc[:5]
some_labels = housing_labels.iloc[:5]
prepared_data = my_pipeline.transform(some_data)
model.predict(prepared_data) # gives the predicted values as an array

array([22.317, 25.696, 16.502, 23.511, 23.583])

list(some_labels) # these are the expected values

[21.9, 24.5, 16.7, 23.1, 23.0]
from sklearn.metrics import mean_squared_error

housing_predictions = model.predict(housing_num_tr)
#lin_mse = mean_squared_error(housing_labels, housing_predictions) # linear regression mean squared error
#lin_rmse = np.sqrt(lin_mse) # root mean squared error
mse = mean_squared_error(housing_labels, housing_predictions) # mean squared error
rmse = np.sqrt(mse) # root mean squared error
rmse
# The training error is near zero (exactly 0 for the decision tree). That means the model has learned the dataset along with factors like noise — this is called overfitting. A model needs to follow the trend, not the noise, so both overfitting and underfitting must be avoided.

1.3030784295522488
Cross-validation: split the training data into 10 folds (1, 2, ..., 10) and test in rounds. In the first round, fold 1 is held out for evaluation and the model is trained on the rest; in the second round, fold 2 is held out; and so on, so every fold is used for evaluation exactly once (see the sketch below).
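To make the fold mechanics concrete, a minimal sketch (not from the original notebook) using scikit-learn's KFold to show what each round holds out:

from sklearn.model_selection import KFold

kf = KFold(n_splits=10, shuffle=True, random_state=42)
for round_no, (train_idx, test_idx) in enumerate(kf.split(housing_num_tr), start=1):
    # each round trains on 9 folds and evaluates on the held-out tenth
    print(f"Round {round_no}: train on {len(train_idx)} rows, evaluate on {len(test_idx)} rows")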
from sklearn.model_selection import cross_val_score

scores = cross_val_score(model, housing_num_tr, housing_labels, scoring="neg_mean_squared_error", cv=10)
rmse_scores = np.sqrt(-scores) # the scores are negative MSEs, so negate them before taking the square root
rmse_scores

array([2.76236684, 2.72915525, 4.43665258, 2.73340914, 3.40079296,
2.61371069, 5.15244869, 3.28961837, 3.02669383, 3.1373396 ])
def print_scores(scores):
    print("Scores: ", scores)
    print("Mean: ", scores.mean())
    print("Standard Deviation: ", scores.std())

print_scores(rmse_scores) # best model = RandomForestRegressor

Scores: [2.76236684 2.72915525 4.43665258 2.73340914 3.40079296 2.61371069
5.15244869 3.28961837 3.02669383 3.1373396 ]
Mean: 3.3282187957275076
Standard Deviation: 0.789552738177566
from joblib import dump, load

dump(model, "DRL.joblib") # serializes the trained model to disk

['DRL.joblib']
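The saved model can later be loaded back for inference (a sketch; the original notebook only saves it):

# Hypothetical reload: recover the trained model from disk and reuse it.
loaded_model = load("DRL.joblib")
print(loaded_model.predict(prepared_data)) # same predictions as before, ~[22.3 25.7 16.5 23.5 23.6]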
X_test = strat_test_set.drop("MEDV", axis=1) # dropping the label
Y_test = strat_test_set["MEDV"].copy()
X_test_prepared = my_pipeline.transform(X_test)
final_predictions = model.predict(X_test_prepared)
final_mse = mean_squared_error(Y_test, final_predictions)
final_rmse = np.sqrt(final_mse)
final_rmse # the lower the RMSE, the better the model — but it should not be 0, since that signals overfitting

3.0763381131264658