Dragon Real Estate -- Price Prediction

import pandas as pd
housing = pd.read_csv("data.csv")
housing.head()  # returns the top 5 rows
CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT MEDV
0 0.00632 18.0 2.31 0 0.538 6.575 65.2 4.0900 1 296 15.3 396.90 4.98 24.0
1 0.02731 0.0 7.07 0 0.469 6.421 78.9 4.9671 2 242 17.8 396.90 9.14 21.6
2 0.02729 0.0 7.07 0 0.469 7.185 61.1 4.9671 2 242 17.8 392.83 4.03 34.7
3 0.03237 0.0 2.18 0 0.458 6.998 45.8 6.0622 3 222 18.7 394.63 2.94 33.4
4 0.06905 0.0 2.18 0 0.458 7.147 54.2 6.0622 3 222 18.7 396.90 5.33 36.2
housing.info() # gives information about each column in the CSV
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     506 non-null    float64
 1   ZN       506 non-null    float64
 2   INDUS    506 non-null    float64
 3   CHAS     506 non-null    int64  
 4   NOX      506 non-null    float64
 5   RM       501 non-null    float64
 6   AGE      506 non-null    float64
 7   DIS      506 non-null    float64
 8   RAD      506 non-null    int64  
 9   TAX      506 non-null    int64  
 10  PTRATIO  506 non-null    float64
 11  B        506 non-null    float64
 12  LSTAT    506 non-null    float64
 13  MEDV     506 non-null    float64
dtypes: float64(11), int64(3)
memory usage: 55.5 KB
housing['CHAS'].value_counts() # returns each distinct value in the selected column along with its count
0    471
1     35
Name: CHAS, dtype: int64
housing['NOX'].value_counts()
0.538    23
0.713    18
0.437    17
0.871    16
0.624    15
         ..
0.394     1
0.518     1
0.385     1
0.389     1
0.435     1
Name: NOX, Length: 81, dtype: int64
housing.describe() # returns 1. count (ignores null values) 2. mean 3. standard deviation 4. min and max values 5. percentiles, i.e. the value below which the given percentage of observations falls
CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT MEDV
count 506.000000 506.000000 506.000000 506.000000 506.000000 501.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000
mean 3.613524 11.363636 11.136779 0.069170 0.554695 6.285850 68.574901 3.795043 9.549407 408.237154 18.455534 356.674032 12.653063 22.532806
std 8.601545 23.322453 6.860353 0.253994 0.115878 0.701639 28.148861 2.105710 8.707259 168.537116 2.164946 91.294864 7.141062 9.197104
min 0.006320 0.000000 0.460000 0.000000 0.385000 3.561000 2.900000 1.129600 1.000000 187.000000 12.600000 0.320000 1.730000 5.000000
25% 0.082045 0.000000 5.190000 0.000000 0.449000 5.885000 45.025000 2.100175 4.000000 279.000000 17.400000 375.377500 6.950000 17.025000
50% 0.256510 0.000000 9.690000 0.000000 0.538000 6.202000 77.500000 3.207450 5.000000 330.000000 19.050000 391.440000 11.360000 21.200000
75% 3.677083 12.500000 18.100000 0.000000 0.624000 6.625000 94.075000 5.188425 24.000000 666.000000 20.200000 396.225000 16.955000 25.000000
max 88.976200 100.000000 27.740000 1.000000 0.871000 8.780000 100.000000 12.126500 24.000000 711.000000 22.000000 396.900000 37.970000 50.000000
%matplotlib inline 
# Renders the plots inline in the notebook, i.e. during execution
import matplotlib.pyplot as plt
housing.hist(bins = 50, figsize=(20,15))
(A 4 × 4 grid of histograms is displayed, one panel per column: CRIM, ZN, INDUS, CHAS, NOX, RM, AGE, DIS, RAD, TAX, PTRATIO, B, LSTAT and MEDV.)

Training data & testing data splitting

This is done to separate the training data from the test data. Applying all the exploration and algorithms directly to the full dataset can bias our predictions in real-life scenarios.

import numpy as np
#for learning purposes only, since scikit-learn already provides this functionality
def split_train_test(data,test_ratio): #test_ratio is the fraction of the data reserved for testing and not used in training
    np.random.seed(42) #fixes the random shuffle so the same split is produced on every run, preventing test data from leaking into the training set across runs
    shuffled=np.random.permutation(len(data)) #produces a randomly shuffled array of indices from 0 to len(data)-1
    test_set_size = int(len(data)*test_ratio)
    test_indices = shuffled[:test_set_size]
    train_indices = shuffled[test_set_size:]
    return data.iloc[train_indices],data.iloc[test_indices]
#train_set, test_set=split_train_test(housing,0.2)
#print(f"Rows in train set: {len(train_set)}\nRows in test set: {len(test_set)}")
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)
print(f"Rows in train set: {len(train_set)}\nRows in test set: {len(test_set)}")
Rows in train set: 404
Rows in test set: 102

Stratified sampling is necessary as it covers all kinds of data in our population. In the above method, stratified sampling is not guaranteed. For example, CHAS has 471 "0" values and 35 "1" values. It is important to make sure the training data exposes both 0 and 1 values to the algorithm.

from sklearn.model_selection import StratifiedShuffleSplit
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in  split.split(housing,housing['CHAS']):
    strat_train_set = housing.loc[train_index]
    strat_test_set = housing.loc[test_index]
strat_test_set
CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT MEDV
342 0.02498 0.0 1.89 0 0.518 6.540 59.7 6.2669 1 422 15.9 389.96 8.65 16.5
379 17.86670 0.0 18.10 0 0.671 6.223 100.0 1.3861 24 666 20.2 393.74 21.78 10.2
223 0.61470 0.0 6.20 0 0.507 6.618 80.8 3.2721 8 307 17.4 396.90 7.60 30.1
219 0.11425 0.0 13.89 1 0.550 6.373 92.4 3.3633 5 276 16.4 393.74 10.50 23.0
48 0.25387 0.0 6.91 0 0.448 5.399 95.3 5.8700 3 233 17.9 396.90 30.81 14.4
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
88 0.05660 0.0 3.41 0 0.489 7.007 86.3 3.4217 2 270 17.8 396.90 5.50 23.6
466 3.77498 0.0 18.10 0 0.655 5.952 84.7 2.8715 24 666 20.2 22.01 17.15 19.0
52 0.05360 21.0 5.64 0 0.439 6.511 21.1 6.8147 4 243 16.8 396.90 5.28 25.0
121 0.07165 0.0 25.65 0 0.581 6.004 84.1 2.1974 2 188 19.1 377.67 14.27 20.3
218 0.11069 0.0 13.89 1 0.550 5.951 93.8 2.8893 5 276 16.4 396.90 17.92 21.5

102 rows × 14 columns

strat_test_set['CHAS'].value_counts()
0    95
1     7
Name: CHAS, dtype: int64
strat_train_set['CHAS'].value_counts()
0    376
1     28
Name: CHAS, dtype: int64
housing = strat_train_set.copy() # copy the training set into the housing variable for exploration

Note: The Pearson correlation value lies between -1 and 1, where -1 means a perfect negative correlation (the variables are inversely related) and 1 means a perfect positive correlation (the variables are directly related).
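As a quick check (an illustrative sketch, not part of the original notebook), Pearson's r for two columns can be computed by hand and compared with the value pandas reports below:

def pearson_r(x, y):
    # drop rows where either value is missing, then apply r = cov(x, y) / (std(x) * std(y))
    pair = pd.concat([x, y], axis=1).dropna()
    xv = pair.iloc[:, 0].to_numpy()
    yv = pair.iloc[:, 1].to_numpy()
    return np.cov(xv, yv)[0, 1] / (xv.std(ddof=1) * yv.std(ddof=1))

# pearson_r(housing["RM"], housing["MEDV"])  # should be close to housing.corr().loc["RM", "MEDV"]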

Looking for correlation:

corr_matrix = housing.corr()
corr_matrix['MEDV'].sort_values(ascending = False)
MEDV       1.000000
RM         0.683023
B          0.361761
ZN         0.339741
DIS        0.240451
CHAS       0.205066
AGE       -0.364596
RAD       -0.374693
CRIM      -0.393715
NOX       -0.422873
TAX       -0.456657
INDUS     -0.473516
PTRATIO   -0.493534
LSTAT     -0.740494
Name: MEDV, dtype: float64
from pandas.plotting import scatter_matrix
attributes = ["MEDV","RM","ZN","LSTAT"]
scatter_matrix(housing[attributes], figsize=(12,8))
(A 4 × 4 scatter matrix of MEDV, RM, ZN and LSTAT is displayed, with histograms along the diagonal.)

Along the diagonal, histograms are drawn instead of straight lines, because plotting a variable against itself would not be very informative. As observed, LSTAT has a strong negative correlation (-0.740494) with MEDV, so MEDV falls as LSTAT rises; similar trends can be read from the other plots.

Graph between RM and MEDV:

housing.plot(kind="scatter",x="RM",y="MEDV",alpha=0.8)
<AxesSubplot:xlabel='RM', ylabel='MEDV'>

We can eliminate the outlier points to get more accurate predictions. This is one benefit of examining the correlation matrix and these plots.
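As a side note (a hypothetical sketch, not part of the original notebook), one simple way to flag such outliers is the 1.5 * IQR rule on a column of interest, for example RM:

# flag RM values lying more than 1.5 * IQR outside the interquartile range
q1, q3 = housing["RM"].quantile([0.25, 0.75])
iqr = q3 - q1
outlier_mask = (housing["RM"] < q1 - 1.5 * iqr) | (housing["RM"] > q3 + 1.5 * iqr)
housing[outlier_mask]  # inspect these rows before deciding whether to drop them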

Trying out new attribute combinations to get better correlations

housing["TAXRM"] = housing["TAX"]/housing["RM"]
housing["TAXRM"]
254     51.571709
348     42.200452
476    102.714374
321     45.012547
326     45.468948
          ...    
155     65.507152
423    109.126659
98      35.294118
455    102.068966
216     46.875000
Name: TAXRM, Length: 404, dtype: float64
housing.head()
CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT MEDV TAXRM
254 0.04819 80.0 3.64 0 0.392 6.108 32.0 9.2203 1 315 16.4 392.89 6.57 21.9 51.571709
348 0.01501 80.0 2.01 0 0.435 6.635 29.7 8.3440 4 280 17.0 390.94 5.99 24.5 42.200452
476 4.87141 0.0 18.10 0 0.614 6.484 93.6 2.3053 24 666 20.2 396.21 18.68 16.7 102.714374
321 0.18159 0.0 7.38 0 0.493 6.376 54.3 4.5404 5 287 19.6 396.90 6.87 23.1 45.012547
326 0.30347 0.0 7.38 0 0.493 6.312 28.9 5.4159 5 287 19.6 396.90 6.15 23.0 45.468948
corr_matrix = housing.corr()
corr_matrix['MEDV'].sort_values(ascending = False)
MEDV       1.000000
RM         0.683023
B          0.361761
ZN         0.339741
DIS        0.240451
CHAS       0.205066
AGE       -0.364596
RAD       -0.374693
CRIM      -0.393715
NOX       -0.422873
TAX       -0.456657
INDUS     -0.473516
PTRATIO   -0.493534
TAXRM     -0.528591
LSTAT     -0.740494
Name: MEDV, dtype: float64
housing.plot(kind="scatter",x="TAXRM",y="MEDV",alpha=0.8)
<AxesSubplot:xlabel='TAXRM', ylabel='MEDV'>

housing = strat_train_set.drop("MEDV", axis=1)
housing_labels = strat_train_set["MEDV"].copy()

Handling missing Values

To take care of missing values, we have three options:

1. Get rid of missing data points.

2. Get rid of the whole attribute.

3. Replace missing values with 0, the mean or the median, whichever suits best.

#option 1
housing.dropna(subset=["RM"]).shape
# Note: the original dataframe is not modified unless inplace=True is passed.
(400, 13)
#option 2
housing.drop("RM",axis=1).shape
# Note: No data in original dataframe is harmed.
(404, 12)
median = housing["RM"].median()
housing["RM"].fillna(median)
# Note: No data in original dataframe is harmed.
254    6.108
348    6.635
476    6.484
321    6.376
326    6.312
       ...  
155    6.152
423    6.103
98     7.820
455    6.525
216    5.888
Name: RM, Length: 404, dtype: float64
housing.describe() #Before imputing/filling missing attributes.
CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT
count 404.000000 404.000000 404.000000 404.000000 404.000000 400.000000 404.000000 404.000000 404.000000 404.000000 404.000000 404.000000 404.000000
mean 3.602814 10.836634 11.344950 0.069307 0.558064 6.282808 69.039851 3.746210 9.735149 412.341584 18.473267 353.392822 12.791609
std 8.099383 22.150636 6.877817 0.254290 0.116875 0.711595 28.258248 2.099057 8.731259 168.672623 2.129243 96.069235 7.235740
min 0.006320 0.000000 0.740000 0.000000 0.389000 3.561000 2.900000 1.129600 1.000000 187.000000 13.000000 0.320000 1.730000
25% 0.086962 0.000000 5.190000 0.000000 0.453000 5.878750 44.850000 2.035975 4.000000 284.000000 17.400000 374.617500 6.847500
50% 0.286735 0.000000 9.900000 0.000000 0.538000 6.209000 78.200000 3.122200 5.000000 337.000000 19.000000 390.955000 11.570000
75% 3.731923 12.500000 18.100000 0.000000 0.631000 6.632000 94.100000 5.100400 24.000000 666.000000 20.200000 395.630000 17.102500
max 73.534100 100.000000 27.740000 1.000000 0.871000 8.780000 100.000000 12.126500 24.000000 711.000000 22.000000 396.900000 36.980000

scikit-learn has a built-in imputer to deal with missing values

from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy="median")
imputer.fit(housing)
SimpleImputer(strategy='median')

It calculates the median of each column

imputer.statistics_ #Shows each column median
array([2.86735e-01, 0.00000e+00, 9.90000e+00, 0.00000e+00, 5.38000e-01,
       6.20900e+00, 7.82000e+01, 3.12220e+00, 5.00000e+00, 3.37000e+02,
       1.90000e+01, 3.90955e+02, 1.15700e+01])
X = imputer.transform(housing)
housing_tr = pd.DataFrame(X, columns=housing.columns) # It is a transformed dataframe without missing values
housing_tr.describe()
CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT
count 404.000000 404.000000 404.000000 404.000000 404.000000 404.000000 404.000000 404.000000 404.000000 404.000000 404.000000 404.000000 404.000000
mean 3.602814 10.836634 11.344950 0.069307 0.558064 6.282077 69.039851 3.746210 9.735149 412.341584 18.473267 353.392822 12.791609
std 8.099383 22.150636 6.877817 0.254290 0.116875 0.708093 28.258248 2.099057 8.731259 168.672623 2.129243 96.069235 7.235740
min 0.006320 0.000000 0.740000 0.000000 0.389000 3.561000 2.900000 1.129600 1.000000 187.000000 13.000000 0.320000 1.730000
25% 0.086962 0.000000 5.190000 0.000000 0.453000 5.879750 44.850000 2.035975 4.000000 284.000000 17.400000 374.617500 6.847500
50% 0.286735 0.000000 9.900000 0.000000 0.538000 6.209000 78.200000 3.122200 5.000000 337.000000 19.000000 390.955000 11.570000
75% 3.731923 12.500000 18.100000 0.000000 0.631000 6.630250 94.100000 5.100400 24.000000 666.000000 20.200000 395.630000 17.102500
max 73.534100 100.000000 27.740000 1.000000 0.871000 8.780000 100.000000 12.126500 24.000000 711.000000 22.000000 396.900000 36.980000

Scikit-learn design

It has primarily three types of objects:

  1. Estimators - An estimator learns (estimates) some parameters from a dataset, e.g. the imputer. Its fit() method takes the dataset and computes the internal parameters.
  2. Transformers - The transform() method takes input and returns output based on what was learned during fit(). There is also a convenience method, fit_transform(), which fits and then transforms (see the sketch after this list).
  3. Predictors - The LinearRegression model is an example of a predictor. fit() and predict() are its two main methods. It also has a score() method to evaluate the predictions.
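To make the fit()/transform() convention concrete, here is a minimal custom transformer sketch (for illustration only; the class name AddTaxPerRoom and the TAX/RM ratio are assumptions, not used elsewhere in this notebook):

from sklearn.base import BaseEstimator, TransformerMixin

class AddTaxPerRoom(BaseEstimator, TransformerMixin):
    # hypothetical transformer that appends the TAX/RM ratio as a new column
    def fit(self, X, y=None):
        return self  # nothing to learn; fit() just returns self, as the API requires
    def transform(self, X):
        X = X.copy()
        X["TAXRM"] = X["TAX"] / X["RM"]
        return X

# fit_transform() is inherited from TransformerMixin: it calls fit() and then transform()
# AddTaxPerRoom().fit_transform(housing)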

Feature Scaling

Primarily, there are two feature scaling methods:

  1. Min-Max scaling (normalization): (value - min) / (max - min). sklearn provides a class called MinMaxScaler. (A small demonstration follows this list.)

  2. Standardization: (value - mean) / std, where std is the standard deviation. sklearn provides a class called StandardScaler.
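A small demonstration of how the two scalers behave (illustration only; picking the RM column here is an assumption):

from sklearn.preprocessing import MinMaxScaler, StandardScaler

rm = housing[["RM"]].dropna()  # scalers expect 2-D input

rm_minmax = MinMaxScaler().fit_transform(rm)  # (value - min) / (max - min) -> range [0, 1]
rm_std = StandardScaler().fit_transform(rm)   # (value - mean) / std -> mean 0, std 1

print(rm_minmax.min(), rm_minmax.max())               # approximately 0.0 and 1.0
print(rm_std.mean().round(3), rm_std.std().round(3))  # approximately 0.0 and 1.0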

Creating pipeline

It is used to automate a series of transformation steps.

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
my_pipeline = Pipeline([
    ('imputer',SimpleImputer(strategy="median")),
      #.... add as many transformation steps as needed
    ('std_scalar',StandardScaler()), 
])
housing_num_tr = my_pipeline.fit_transform(housing_tr)
housing_num_tr   #returns numpy array
array([[-0.43942006,  3.12628155, -1.12165014, ..., -0.97491834,
         0.41164221, -0.86091034],
       [-0.44352175,  3.12628155, -1.35893781, ..., -0.69277865,
         0.39131918, -0.94116739],
       [ 0.15682292, -0.4898311 ,  0.98336806, ...,  0.81196637,
         0.44624347,  0.81480158],
       ...,
       [-0.43525657, -0.4898311 , -1.23083158, ..., -0.22254583,
         0.41831233, -1.27603303],
       [ 0.14210728, -0.4898311 ,  0.98336806, ...,  0.81196637,
        -3.15239177,  0.73869575],
       [-0.43974024, -0.4898311 ,  0.37049623, ..., -0.97491834,
         0.41070422,  0.09940681]])
housing_num_tr.shape
(404, 13)

Selecting a desired model

#from sklearn.linear_model import LinearRegression
#from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor # chosen model
#model = LinearRegression()  # discarded due to a heavy error of 22.8
#model = DecisionTreeRegressor()
model = RandomForestRegressor()
model.fit(housing_num_tr, housing_labels)
RandomForestRegressor()
some_data = housing.iloc[:5]
some_labels = housing_labels.iloc[:5]
prepared_data = my_pipeline.transform(some_data)
model.predict(prepared_data) # gives predicted values in the form of an array
array([22.317, 25.696, 16.502, 23.511, 23.583])
list(some_labels) #These are the expected values
[21.9, 24.5, 16.7, 23.1, 23.0]

Evaluation of the model

from sklearn.metrics import mean_squared_error
housing_predictions = model.predict(housing_num_tr)
#lin_mse = mean_squared_error(housing_labels, housing_predictions)  # MSE for the linear regression model
#lin_rmse = np.sqrt(lin_mse)  # its root mean squared error
mse = mean_squared_error(housing_labels, housing_predictions) # mean squared error
rmse = np.sqrt(mse) # root mean squared error
rmse # The training error is very low. If it were (near) zero, it would mean the model has memorized the dataset along with factors like noise, which is called overfitting. Our model needs to follow the trend and not the noise, so we must avoid both overfitting and underfitting.
1.3030784295522488

Better evaluation - (cross validation)

We split the training data into 10 folds (1, 2, 3, ..., 10) and test in rounds: in the first round, fold 1 is held out for evaluation and the model is trained on folds 2-10; in the second round, fold 2 is held out and the model is trained on the rest, and so on.

from sklearn.model_selection import cross_val_score
scores = cross_val_score(model, housing_num_tr, housing_labels, scoring="neg_mean_squared_error", cv=10)
rmse_scores = np.sqrt(-scores)
rmse_scores
array([2.76236684, 2.72915525, 4.43665258, 2.73340914, 3.40079296,
       2.61371069, 5.15244869, 3.28961837, 3.02669383, 3.1373396 ])
def print_scores(scores):
    print("Scores: ",scores)
    print("Mean: ",scores.mean())
    print("Standard Deviation: ",scores.std())
print_scores(rmse_scores) #best model = RandomForestRegressor
Scores:  [2.76236684 2.72915525 4.43665258 2.73340914 3.40079296 2.61371069
 5.15244869 3.28961837 3.02669383 3.1373396 ]
Mean:  3.3282187957275076
Standard Deviation:  0.789552738177566

Saving the model

from joblib import dump,load
dump(model,"DRL.joblib")
['DRL.joblib']
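The load function imported above can bring the model back later; a minimal sketch reusing the names from this notebook:

loaded_model = load("DRL.joblib")
loaded_model.predict(prepared_data)  # should match model.predict(prepared_data) above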

Testing on test data set

X_test = strat_test_set.drop("MEDV", axis=1) #Dropping label
Y_test = strat_test_set["MEDV"].copy()
X_test_prepared = my_pipeline.transform(X_test)
final_predictions = model.predict(X_test_prepared)
final_mse = mean_squared_error(Y_test, final_predictions)
final_rmse = np.sqrt(final_mse)
final_rmse # The lower the RMSE, the better the model. But it should not be 0, which would indicate overfitting.
3.0763381131264658