Dragon Real Estate -- Price Prediction

import pandas as pd
housing = pd.read_csv("data.csv")
housing.head()  # returns the top 5 rows
CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT MEDV
0 0.00632 18.0 2.31 0 0.538 6.575 65.2 4.0900 1 296 15.3 396.90 4.98 24.0
1 0.02731 0.0 7.07 0 0.469 6.421 78.9 4.9671 2 242 17.8 396.90 9.14 21.6
2 0.02729 0.0 7.07 0 0.469 7.185 61.1 4.9671 2 242 17.8 392.83 4.03 34.7
3 0.03237 0.0 2.18 0 0.458 6.998 45.8 6.0622 3 222 18.7 394.63 2.94 33.4
4 0.06905 0.0 2.18 0 0.458 7.147 54.2 6.0622 3 222 18.7 396.90 5.33 36.2
housing.info() # gives information about each column in the CSV
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     506 non-null    float64
 1   ZN       506 non-null    float64
 2   INDUS    506 non-null    float64
 3   CHAS     506 non-null    int64  
 4   NOX      506 non-null    float64
 5   RM       501 non-null    float64
 6   AGE      506 non-null    float64
 7   DIS      506 non-null    float64
 8   RAD      506 non-null    int64  
 9   TAX      506 non-null    int64  
 10  PTRATIO  506 non-null    float64
 11  B        506 non-null    float64
 12  LSTAT    506 non-null    float64
 13  MEDV     506 non-null    float64
dtypes: float64(11), int64(3)
memory usage: 55.5 KB
housing['CHAS'].value_counts() # returns each distinct value in the selected column along with its count
0    471
1     35
Name: CHAS, dtype: int64
housing['NOX'].value_counts()
0.538    23
0.713    18
0.437    17
0.871    16
0.624    15
         ..
0.394     1
0.518     1
0.385     1
0.389     1
0.435     1
Name: NOX, Length: 81, dtype: int64
housing.describe() # returns 1. count (ignores null values) 2. mean 3. standard deviation 4. min and max values 5. percentiles, i.e. the value below which the given percentage of observations falls
CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT MEDV
count 506.000000 506.000000 506.000000 506.000000 506.000000 501.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000
mean 3.613524 11.363636 11.136779 0.069170 0.554695 6.285850 68.574901 3.795043 9.549407 408.237154 18.455534 356.674032 12.653063 22.532806
std 8.601545 23.322453 6.860353 0.253994 0.115878 0.701639 28.148861 2.105710 8.707259 168.537116 2.164946 91.294864 7.141062 9.197104
min 0.006320 0.000000 0.460000 0.000000 0.385000 3.561000 2.900000 1.129600 1.000000 187.000000 12.600000 0.320000 1.730000 5.000000
25% 0.082045 0.000000 5.190000 0.000000 0.449000 5.885000 45.025000 2.100175 4.000000 279.000000 17.400000 375.377500 6.950000 17.025000
50% 0.256510 0.000000 9.690000 0.000000 0.538000 6.202000 77.500000 3.207450 5.000000 330.000000 19.050000 391.440000 11.360000 21.200000
75% 3.677083 12.500000 18.100000 0.000000 0.624000 6.625000 94.075000 5.188425 24.000000 666.000000 20.200000 396.225000 16.955000 25.000000
max 88.976200 100.000000 27.740000 1.000000 0.871000 8.780000 100.000000 12.126500 24.000000 711.000000 22.000000 396.900000 37.970000 50.000000
%matplotlib inline 
# Renders the plots inline in the notebook, i.e. during execution
import matplotlib.pyplot as plt
housing.hist(bins = 50, figsize=(20,15))
(A 4 × 4 grid of histograms is displayed, one panel per column: CRIM, ZN, INDUS, CHAS, NOX, RM, AGE, DIS, RAD, TAX, PTRATIO, B, LSTAT and MEDV.)

Training data & testing data splitting

This is done to separate the training data from the test data. Applying all the exploration and algorithms directly to the full dataset can bias our predictions in real-life scenarios.

import numpy as np
#for learning purposes only, since scikit-learn already provides this functionality
def split_train_test(data,test_ratio): #test_ratio is the fraction of the data reserved for testing and not used in training
    np.random.seed(42) #fixes the random shuffle so the same split is produced on every run, preventing test data from leaking into the training set across runs
    shuffled=np.random.permutation(len(data)) #produces a randomly shuffled array of indices from 0 to len(data)-1
    test_set_size = int(len(data)*test_ratio)
    test_indices = shuffled[:test_set_size]
    train_indices = shuffled[test_set_size:]
    return data.iloc[train_indices],data.iloc[test_indices]
#train_set, test_set=split_train_test(housing,0.2)
#print(f"Rows in train set: {len(train_set)}\nRows in test set: {len(test_set)}")
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)
print(f"Rows in train set: {len(train_set)}\nRows in test set: {len(test_set)}")
Rows in train set: 404
Rows in test set: 102

Stratified sampling is necessary as it covers all kinds of data in our population. In the above method, stratified sampling is not guaranteed. For example, CHAS has 471 "0" values and 35 "1" values. It is important to make sure the training data exposes both 0 and 1 values to the algorithm.

from sklearn.model_selection import StratifiedShuffleSplit
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in  split.split(housing,housing['CHAS']):
    strat_train_set = housing.loc[train_index]
    strat_test_set = housing.loc[test_index]
strat_test_set
CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT MEDV
342 0.02498 0.0 1.89 0 0.518 6.540 59.7 6.2669 1 422 15.9 389.96 8.65 16.5
379 17.86670 0.0 18.10 0 0.671 6.223 100.0 1.3861 24 666 20.2 393.74 21.78 10.2
223 0.61470 0.0 6.20 0 0.507 6.618 80.8 3.2721 8 307 17.4 396.90 7.60 30.1
219 0.11425 0.0 13.89 1 0.550 6.373 92.4 3.3633 5 276 16.4 393.74 10.50 23.0
48 0.25387 0.0 6.91 0 0.448 5.399 95.3 5.8700 3 233 17.9 396.90 30.81 14.4
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
88 0.05660 0.0 3.41 0 0.489 7.007 86.3 3.4217 2 270 17.8 396.90 5.50 23.6
466 3.77498 0.0 18.10 0 0.655 5.952 84.7 2.8715 24 666 20.2 22.01 17.15 19.0
52 0.05360 21.0 5.64 0 0.439 6.511 21.1 6.8147 4 243 16.8 396.90 5.28 25.0
121 0.07165 0.0 25.65 0 0.581 6.004 84.1 2.1974 2 188 19.1 377.67 14.27 20.3
218 0.11069 0.0 13.89 1 0.550 5.951 93.8 2.8893 5 276 16.4 396.90 17.92 21.5

102 rows × 14 columns

strat_test_set['CHAS'].value_counts()
0    95
1     7
Name: CHAS, dtype: int64
strat_train_set['CHAS'].value_counts()
0    376
1     28
Name: CHAS, dtype: int64
housing = strat_train_set.copy() # copy the training set into the housing variable for exploration

Note: The Pearson correlation value lies between -1 and 1, where -1 means a perfect negative correlation (the variables are inversely related) and 1 means a perfect positive correlation (the variables are directly related).
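As a quick check (an illustrative sketch, not part of the original notebook), Pearson's r for two columns can be computed by hand and compared with the value pandas reports below:

def pearson_r(x, y):
    # drop rows where either value is missing, then apply r = cov(x, y) / (std(x) * std(y))
    pair = pd.concat([x, y], axis=1).dropna()
    xv = pair.iloc[:, 0].to_numpy()
    yv = pair.iloc[:, 1].to_numpy()
    return np.cov(xv, yv)[0, 1] / (xv.std(ddof=1) * yv.std(ddof=1))

# pearson_r(housing["RM"], housing["MEDV"])  # should be close to housing.corr().loc["RM", "MEDV"]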

Looking for correlation:

corr_matrix = housing.corr()
corr_matrix['MEDV'].sort_values(ascending = False)
MEDV       1.000000
RM         0.683023
B          0.361761
ZN         0.339741
DIS        0.240451
CHAS       0.205066
AGE       -0.364596
RAD       -0.374693
CRIM      -0.393715
NOX       -0.422873
TAX       -0.456657
INDUS     -0.473516
PTRATIO   -0.493534
LSTAT     -0.740494
Name: MEDV, dtype: float64
from pandas.plotting import scatter_matrix
attributes = ["MEDV","RM","ZN","LSTAT"]
scatter_matrix(housing[attributes], figsize=(12,8))
(A 4 × 4 scatter matrix of MEDV, RM, ZN and LSTAT is displayed, with histograms along the diagonal.)

Along the diagonal, histograms are drawn instead of straight lines, because plotting a variable against itself would not be very informative. As observed, LSTAT has a strong negative correlation (-0.740494) with MEDV, so MEDV falls as LSTAT rises; similar trends can be read from the other plots.

Graph between RM and MEDV:

housing.plot(kind="scatter",x="RM",y="MEDV",alpha=0.8)
<AxesSubplot:xlabel='RM', ylabel='MEDV'>

We can eliminate the outlier points to get more accurate predictions. This is one benefit of examining the correlation matrix and these plots.
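As a side note (a hypothetical sketch, not part of the original notebook), one simple way to flag such outliers is the 1.5 * IQR rule on a column of interest, for example RM:

# flag RM values lying more than 1.5 * IQR outside the interquartile range
q1, q3 = housing["RM"].quantile([0.25, 0.75])
iqr = q3 - q1
outlier_mask = (housing["RM"] < q1 - 1.5 * iqr) | (housing["RM"] > q3 + 1.5 * iqr)
housing[outlier_mask]  # inspect these rows before deciding whether to drop them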

Trying out new attribute combinations to get better correlations

housing["TAXRM"] = housing["TAX"]/housing["RM"]
housing["TAXRM"]
254     51.571709
348     42.200452
476    102.714374
321     45.012547
326     45.468948
          ...    
155     65.507152
423    109.126659
98      35.294118
455    102.068966
216     46.875000
Name: TAXRM, Length: 404, dtype: float64
housing.head()
CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT MEDV TAXRM
254 0.04819 80.0 3.64 0 0.392 6.108 32.0 9.2203 1 315 16.4 392.89 6.57 21.9 51.571709
348 0.01501 80.0 2.01 0 0.435 6.635 29.7 8.3440 4 280 17.0 390.94 5.99 24.5 42.200452
476 4.87141 0.0 18.10 0 0.614 6.484 93.6 2.3053 24 666 20.2 396.21 18.68 16.7 102.714374
321 0.18159 0.0 7.38 0 0.493 6.376 54.3 4.5404 5 287 19.6 396.90 6.87 23.1 45.012547
326 0.30347 0.0 7.38 0 0.493 6.312 28.9 5.4159 5 287 19.6 396.90 6.15 23.0 45.468948
corr_matrix = housing.corr()
corr_matrix['MEDV'].sort_values(ascending = False)
MEDV       1.000000
RM         0.683023
B          0.361761
ZN         0.339741
DIS        0.240451
CHAS       0.205066
AGE       -0.364596
RAD       -0.374693
CRIM      -0.393715
NOX       -0.422873
TAX       -0.456657
INDUS     -0.473516
PTRATIO   -0.493534
TAXRM     -0.528591
LSTAT     -0.740494
Name: MEDV, dtype: float64
housing.plot(kind="scatter",x="TAXRM",y="MEDV",alpha=0.8)
<AxesSubplot:xlabel='TAXRM', ylabel='MEDV'>

housing = strat_train_set.drop("MEDV", axis=1)
housing_labels = strat_train_set["MEDV"].copy()

Handling missing Values

To take care of missing values, we have three options:

1. Get rid of missing data points.

2. Get rid of the whole attribute.

3. Replace missing values with 0, the mean or the median, whichever suits best.

#option 1
housing.dropna(subset=["RM"]).shape
# Note: the original dataframe is not modified unless inplace=True is passed.
(400, 13)
#option 2
housing.drop("RM",axis=1).shape
# Note: No data in original dataframe is harmed.
(404, 12)
median = housing["RM"].median()
housing["RM"].fillna(median)
# Note: No data in original dataframe is harmed.
254    6.108
348    6.635
476    6.484
321    6.376
326    6.312
       ...  
155    6.152
423    6.103
98     7.820
455    6.525
216    5.888
Name: RM, Length: 404, dtype: float64
housing.describe() #Before imputing/filling missing attributes.
CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT
count 404.000000 404.000000 404.000000 404.000000 404.000000 400.000000 404.000000 404.000000 404.000000 404.000000 404.000000 404.000000 404.000000
mean 3.602814 10.836634 11.344950 0.069307 0.558064 6.282808 69.039851 3.746210 9.735149 412.341584 18.473267 353.392822 12.791609
std 8.099383 22.150636 6.877817 0.254290 0.116875 0.711595 28.258248 2.099057 8.731259 168.672623 2.129243 96.069235 7.235740
min 0.006320 0.000000 0.740000 0.000000 0.389000 3.561000 2.900000 1.129600 1.000000 187.000000 13.000000 0.320000 1.730000
25% 0.086962 0.000000 5.190000 0.000000 0.453000 5.878750 44.850000 2.035975 4.000000 284.000000 17.400000 374.617500 6.847500
50% 0.286735 0.000000 9.900000 0.000000 0.538000 6.209000 78.200000 3.122200 5.000000 337.000000 19.000000 390.955000 11.570000
75% 3.731923 12.500000 18.100000 0.000000 0.631000 6.632000 94.100000 5.100400 24.000000 666.000000 20.200000 395.630000 17.102500
max 73.534100 100.000000 27.740000 1.000000 0.871000 8.780000 100.000000 12.126500 24.000000 711.000000 22.000000 396.900000 36.980000

scikit-learn has a built-in imputer to deal with missing values

from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy="median")
imputer.fit(housing)
SimpleImputer(strategy='median')

It calculates the median of each column

imputer.statistics_ #Shows each column median
array([2.86735e-01, 0.00000e+00, 9.90000e+00, 0.00000e+00, 5.38000e-01,
       6.20900e+00, 7.82000e+01, 3.12220e+00, 5.00000e+00, 3.37000e+02,
       1.90000e+01, 3.90955e+02, 1.15700e+01])
X = imputer.transform(housing)
housing_tr = pd.DataFrame(X, columns=housing.columns) # It is a transformed dataframe without missing values
housing_tr.describe()
CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT
count 404.000000 404.000000 404.000000 404.000000 404.000000 404.000000 404.000000 404.000000 404.000000 404.000000 404.000000 404.000000 404.000000
mean 3.602814 10.836634 11.344950 0.069307 0.558064 6.282077 69.039851 3.746210 9.735149 412.341584 18.473267 353.392822 12.791609
std 8.099383 22.150636 6.877817 0.254290 0.116875 0.708093 28.258248 2.099057 8.731259 168.672623 2.129243 96.069235 7.235740
min 0.006320 0.000000 0.740000 0.000000 0.389000 3.561000 2.900000 1.129600 1.000000 187.000000 13.000000 0.320000 1.730000
25% 0.086962 0.000000 5.190000 0.000000 0.453000 5.879750 44.850000 2.035975 4.000000 284.000000 17.400000 374.617500 6.847500
50% 0.286735 0.000000 9.900000 0.000000 0.538000 6.209000 78.200000 3.122200 5.000000 337.000000 19.000000 390.955000 11.570000
75% 3.731923 12.500000 18.100000 0.000000 0.631000 6.630250 94.100000 5.100400 24.000000 666.000000 20.200000 395.630000 17.102500
max 73.534100 100.000000 27.740000 1.000000 0.871000 8.780000 100.000000 12.126500 24.000000 711.000000 22.000000 396.900000 36.980000

Scikit-learn design

It has primarily three types of objects:

  1. Estimators - An estimator learns (estimates) some parameters from a dataset, e.g. the imputer. Its fit() method takes the dataset and computes the internal parameters.
  2. Transformers - The transform() method takes input and returns output based on what was learned during fit(). There is also a convenience method, fit_transform(), which fits and then transforms (see the sketch after this list).
  3. Predictors - The LinearRegression model is an example of a predictor. fit() and predict() are its two main methods. It also has a score() method to evaluate the predictions.
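To make the fit()/transform() convention concrete, here is a minimal custom transformer sketch (for illustration only; the class name AddTaxPerRoom and the TAX/RM ratio are assumptions, not used elsewhere in this notebook):

from sklearn.base import BaseEstimator, TransformerMixin

class AddTaxPerRoom(BaseEstimator, TransformerMixin):
    # hypothetical transformer that appends the TAX/RM ratio as a new column
    def fit(self, X, y=None):
        return self  # nothing to learn; fit() just returns self, as the API requires
    def transform(self, X):
        X = X.copy()
        X["TAXRM"] = X["TAX"] / X["RM"]
        return X

# fit_transform() is inherited from TransformerMixin: it calls fit() and then transform()
# AddTaxPerRoom().fit_transform(housing)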

Feature Scaling

Primarily, there are two feature scaling methods:

  1. Min-Max scaling (normalization): (value - min) / (max - min). sklearn provides a class called MinMaxScaler. (A small demonstration follows this list.)

  2. Standardization: (value - mean) / std, where std is the standard deviation. sklearn provides a class called StandardScaler.
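A small demonstration of how the two scalers behave (illustration only; picking the RM column here is an assumption):

from sklearn.preprocessing import MinMaxScaler, StandardScaler

rm = housing[["RM"]].dropna()  # scalers expect 2-D input

rm_minmax = MinMaxScaler().fit_transform(rm)  # (value - min) / (max - min) -> range [0, 1]
rm_std = StandardScaler().fit_transform(rm)   # (value - mean) / std -> mean 0, std 1

print(rm_minmax.min(), rm_minmax.max())               # approximately 0.0 and 1.0
print(rm_std.mean().round(3), rm_std.std().round(3))  # approximately 0.0 and 1.0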

Creating pipeline

It is used to automate a series of transformation steps.

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
my_pipeline = Pipeline([
    ('imputer',SimpleImputer(strategy="median")),
      #.... add as many transformation steps as needed
    ('std_scalar',StandardScaler()), 
])
housing_num_tr = my_pipeline.fit_transform(housing_tr)
housing_num_tr   #returns numpy array
array([[-0.43942006,  3.12628155, -1.12165014, ..., -0.97491834,
         0.41164221, -0.86091034],
       [-0.44352175,  3.12628155, -1.35893781, ..., -0.69277865,
         0.39131918, -0.94116739],
       [ 0.15682292, -0.4898311 ,  0.98336806, ...,  0.81196637,
         0.44624347,  0.81480158],
       ...,
       [-0.43525657, -0.4898311 , -1.23083158, ..., -0.22254583,
         0.41831233, -1.27603303],
       [ 0.14210728, -0.4898311 ,  0.98336806, ...,  0.81196637,
        -3.15239177,  0.73869575],
       [-0.43974024, -0.4898311 ,  0.37049623, ..., -0.97491834,
         0.41070422,  0.09940681]])
housing_num_tr.shape
(404, 13)

Selecting a desired model

#from sklearn.linear_model import LinearRegression
#from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor # chosen model
#model = LinearRegression()  # discarded due to a heavy error of 22.8
#model = DecisionTreeRegressor()
model = RandomForestRegressor()
model.fit(housing_num_tr, housing_labels)
RandomForestRegressor()
some_data = housing.iloc[:5]
some_labels = housing_labels.iloc[:5]
prepared_data = my_pipeline.transform(some_data)
model.predict(prepared_data) # gives predicted values in the form of an array
array([22.317, 25.696, 16.502, 23.511, 23.583])
list(some_labels) #These are the expected values
[21.9, 24.5, 16.7, 23.1, 23.0]

Evaluation of the model

from sklearn.metrics import mean_squared_error
housing_predictions = model.predict(housing_num_tr)
#lin_mse = mean_squared_error(housing_labels, housing_predictions)  # MSE for the linear regression model
#lin_rmse = np.sqrt(lin_mse)  # its root mean squared error
mse = mean_squared_error(housing_labels, housing_predictions) # mean squared error
rmse = np.sqrt(mse) # root mean squared error
rmse # The training error is very low. If it were (near) zero, it would mean the model has memorized the dataset along with factors like noise, which is called overfitting. Our model needs to follow the trend and not the noise, so we must avoid both overfitting and underfitting.
1.3030784295522488

Better evaluation - (cross validation)

We split the training data into 10 folds (1, 2, 3, ..., 10) and test in rounds: in the first round, fold 1 is held out for evaluation and the model is trained on folds 2-10; in the second round, fold 2 is held out and the model is trained on the rest, and so on.

from sklearn.model_selection import cross_val_score
scores = cross_val_score(model, housing_num_tr, housing_labels, scoring="neg_mean_squared_error", cv=10)
rmse_scores = np.sqrt(-scores)
rmse_scores
array([2.76236684, 2.72915525, 4.43665258, 2.73340914, 3.40079296,
       2.61371069, 5.15244869, 3.28961837, 3.02669383, 3.1373396 ])
def print_scores(scores):
    print("Scores: ",scores)
    print("Mean: ",scores.mean())
    print("Standard Deviation: ",scores.std())
print_scores(rmse_scores) #best model = RandomForestRegressor
Scores:  [2.76236684 2.72915525 4.43665258 2.73340914 3.40079296 2.61371069
 5.15244869 3.28961837 3.02669383 3.1373396 ]
Mean:  3.3282187957275076
Standard Deviation:  0.789552738177566

Saving the model

from joblib import dump,load
dump(model,"DRL.joblib")
['DRL.joblib']
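The load function imported above can bring the model back later; a minimal sketch reusing the names from this notebook:

loaded_model = load("DRL.joblib")
loaded_model.predict(prepared_data)  # should match model.predict(prepared_data) above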

Testing on test data set

X_test = strat_test_set.drop("MEDV", axis=1) #Dropping label
Y_test = strat_test_set["MEDV"].copy()
X_test_prepared = my_pipeline.transform(X_test)
final_predictions = model.predict(X_test_prepared)
final_mse = mean_squared_error(Y_test, final_predictions)
final_rmse = np.sqrt(final_mse)
final_rmse # The lower the RMSE, the better the model. But it should not be 0, which would indicate overfitting.
3.0763381131264658