Boston House Price Predictor

import pandas as pd

housing = pd.read_csv("C:\\Users\\Arslan Mushtaq\\Downloads\\archive(1)\\data.csv")
housing.head()

[Output: first five rows of the dataset]
housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 511 entries, 0 to 510
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   CRIM     511 non-null    float64
 1   ZN       511 non-null    float64
 2   INDUS    511 non-null    float64
 3   CHAS     511 non-null    int64
 4   NOX      511 non-null    float64
 5   RM       506 non-null    float64
 6   AGE      511 non-null    float64
 7   DIS      511 non-null    float64
 8   RAD      511 non-null    int64
 9   TAX      511 non-null    int64
 10  PTRATIO  511 non-null    float64
 11  B        511 non-null    float64
 12  LSTAT    511 non-null    float64
 13  MEDV     511 non-null    float64
dtypes: float64(11), int64(3)
memory usage: 56.0 KB
housing['CHAS'].value_counts()

0    476
1     35
Name: CHAS, dtype: int64
housing.describe()
[Output: summary statistics from housing.describe()]
%matplotlib inline

# For plotting histograms
import matplotlib.pyplot as plt
housing.hist(bins=50, figsize=(20, 15))
[Image: 50-bin histograms of each attribute]

Train-Test Splitting

# For learning purposes
import numpy as np

def split_train_test(data, test_ratio):
    np.random.seed(42)
    shuffled = np.random.permutation(len(data))
    print(shuffled)
    test_set_size = int(len(data) * test_ratio)
    test_indices = shuffled[:test_set_size]
    train_indices = shuffled[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]

A Second Method for Training and Testing

# train_set, test_set = split_train_test(housing, 0.2)
# print(f"Rows in train set: {len(train_set)}\nRows in test set: {len(test_set)}\n")

from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)
print(f"Rows in train set: {len(train_set)}\nRows in test set: {len(test_set)}\n")
Rows in train set: 408
Rows in test set: 103
from sklearn.model_selection import StratifiedShuffleSplit
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(housing, housing['CHAS']):
    strat_train_set = housing.loc[train_index]
    strat_test_set = housing.loc[test_index]
strat_test_set['CHAS'].value_counts()

0    96
1     7
Name: CHAS, dtype: int64

strat_train_set['CHAS'].value_counts()

0    380
1     28
Name: CHAS, dtype: int64
# Comparing the CHAS class ratio in the test set and the train set
95/7

13.571428571428571

376/28

13.428571428571429

housing = strat_train_set.copy()

Looking for Correlations

corr_matrix = housing.corr()
corr_matrix['MEDV'].sort_values(ascending=False)
MEDV       1.000000
RM         0.681481
B          0.350283
ZN         0.341767
DIS        0.250670
CHAS       0.191563
AGE       -0.366230
RAD       -0.381955
CRIM      -0.395849
NOX       -0.425104
TAX       -0.467171
PTRATIO   -0.475870
INDUS     -0.481414
LSTAT     -0.720350
Name: MEDV, dtype: float64
from pandas.plotting import scatter_matrix
attributes = ["MEDV", "RM", "ZN", "LSTAT"]
scatter_matrix(housing[attributes], figsize=(12, 8))
[Image: scatter matrix of MEDV, RM, ZN, and LSTAT]
housing.plot(kind="scatter", x="RM", y="MEDV", alpha=0.8)<AxesSubplot:xlabel='RM', ylabel='MEDV'>
png

Trying Out Attribute Combinations

housing["TAXRM"] = housing['TAX']/housing['RM']housing.head()
png
corr_matrix = housing.corr()
corr_matrix['MEDV'].sort_values(ascending=False)
MEDV       1.000000
RM         0.681481
B          0.350283
ZN         0.341767
DIS        0.250670
CHAS       0.191563
AGE       -0.366230
RAD       -0.381955
CRIM      -0.395849
NOX       -0.425104
TAX       -0.467171
PTRATIO   -0.475870
INDUS     -0.481414
TAXRM     -0.535069
LSTAT     -0.720350
Name: MEDV, dtype: float64
housing.plot(kind="scatter", x="TAXRM", y="MEDV", alpha=0.8)<AxesSubplot:xlabel='TAXRM', ylabel='MEDV'>
png
housing = strat_train_set.drop("MEDV", axis=1)
housing_labels = strat_train_set["MEDV"].copy()

Missing Attributes

# To take care of missing attributes, you have three options:
#     1. Get rid of the missing data points
#     2. Get rid of the whole attribute
#     3. Set the missing values to some value (0, the mean, or the median)
a = housing.dropna(subset=["RM"])  # Option 1
a.shape
# Note that the original housing dataframe will remain unchanged

(403, 13)

housing.drop("RM", axis=1).shape  # Option 2
# Note that there is no RM column; the original housing dataframe will remain unchanged

(408, 12)

median = housing["RM"].median()  # Compute the median for Option 3
housing["RM"].fillna(median)  # Option 3
# Note that the original housing dataframe will remain unchanged

495    5.670
254    6.108
365    3.561
193    6.800
35     6.208
        ...
386    4.652
243    6.393
22     6.142
480    6.242
455    6.525
Name: RM, Length: 408, dtype: float64
housing.shape

(408, 13)

housing.describe()  # before we started filling missing attributes

[Output: housing.describe() before imputation]
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy="median")
imputer.fit(housing)
SimpleImputer(strategy='median')

imputer.statistics_

array([2.86735e-01, 0.00000e+00, 9.90000e+00, 0.00000e+00, 5.38000e-01,
       6.20800e+00, 7.79500e+01, 3.10730e+00, 5.00000e+00, 3.35000e+02,
       1.90500e+01, 3.90980e+02, 1.17050e+01])

X = imputer.transform(housing)
housing_tr = pd.DataFrame(X, columns=housing.columns)
housing_tr.describe()

[Output: housing_tr.describe() after imputation]

Scikit-learn Design

Primarily, there are three types of objects:

  1. Estimators: estimate some parameters based on a dataset, e.g. the imputer. An estimator has a fit method, which fits the dataset and computes internal parameters.
  2. Transformers: the transform method takes input and returns a transformed output based on the parameters learned by fit; the convenience method fit_transform does both in one step.
  3. Predictors: LinearRegression is an example. predict takes prepared data and returns predictions, and score evaluates them. (See the sketch below.)
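A minimal sketch of the three roles on toy data (the arrays here are made up purely for illustration):

import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression

X = np.array([[1.0, 2.0], [np.nan, 4.0], [5.0, 6.0]])  # toy feature matrix with a missing value
y = np.array([1.0, 2.0, 3.0])                          # toy labels

# Estimator + transformer: fit learns the column medians, transform fills the NaNs
imputer = SimpleImputer(strategy="median")
X_filled = imputer.fit_transform(X)

# Predictor: fit learns the coefficients, predict and score use them
model = LinearRegression()
model.fit(X_filled, y)
print(model.predict(X_filled))
print(model.score(X_filled, y))  # R^2 on the training data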

Feature Scaling

Primarily, two types of feature scaling methods:

  1. Min-max scaling (normalization): (value - min)/(max - min). The scaled values fall in the range 0 to 1. Sklearn provides a class called MinMaxScaler for this.
  2. Standardization: (value - mean)/std. The scaled values have zero mean and unit variance, and are less affected by outliers. Sklearn provides a class called StandardScaler for this, which is what the pipeline below uses.
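A quick sketch contrasting the two scalers on a single toy column (the numbers are arbitrary):

import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler

col = np.array([[1.0], [2.0], [3.0], [10.0]])  # toy column with a mild outlier

print(MinMaxScaler().fit_transform(col).ravel())    # mapped into [0, 1]
print(StandardScaler().fit_transform(col).ravel())  # zero mean, unit variance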

Creating a Pipeline

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
my_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    # ..... add as many as you want in your pipeline
    ('std_scaler', StandardScaler()),
])

housing_num_tr = my_pipeline.fit_transform(housing)
housing_num_tr.shape

(408, 13)

Selecting a desired model for Dragon Real Estates

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
# model = LinearRegression()
# model = DecisionTreeRegressor()
model = RandomForestRegressor()
model.fit(housing_num_tr, housing_labels)
RandomForestRegressor()

some_data = housing.iloc[:5]
some_labels = housing_labels.iloc[:5]
prepared_data = my_pipeline.transform(some_data)
model.predict(prepared_data)

array([21.748, 22.391, 27.574, 30.62 , 20.548])

list(some_labels)

[23.1, 21.9, 27.5, 31.1, 18.9]

Evaluating the model

from sklearn.metrics import mean_squared_error
housing_predictions = model.predict(housing_num_tr)
mse = mean_squared_error(housing_labels, housing_predictions)
rmse = np.sqrt(mse)
rmse

1.5076935117888486

Using a better evaluation technique: cross-validation

from sklearn.metrics import r2_score
# 10 folds: 1 2 3 4 5 6 7 8 9 10
from sklearn.model_selection import cross_val_score
scores = cross_val_score(model, housing_num_tr, housing_labels, scoring="neg_mean_squared_error", cv=10)
rmse_scores = np.sqrt(-scores)
rmse_scores

array([3.07918078, 3.74030517, 4.90385085, 4.29989591, 3.00696454,
       2.27306873, 6.86476013, 2.90492124, 3.38468456, 3.32520813])
def print_scores(scores):
    print("Scores:", scores)
    print("Mean: ", scores.mean())
    print("Standard deviation: ", scores.std())

print_scores(rmse_scores)

Scores: [3.07918078 3.74030517 4.90385085 4.29989591 3.00696454 2.27306873
 6.86476013 2.90492124 3.38468456 3.32520813]
Mean:  3.778284003174746
Standard deviation:  1.246558053523797

Quiz: Convert this notebook into a python file and run the pipeline using Visual Studio Code
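One possible answer, as a hedged sketch: a Dragon.py script that condenses the cells above (the file name and the data.csv path are placeholders; adjust them to your setup, then run the file from VS Code's integrated terminal with python Dragon.py):

# Dragon.py -- the notebook's pipeline, condensed into a runnable script
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from joblib import dump

housing = pd.read_csv("data.csv")  # placeholder path; point it at your dataset

# Stratified split on CHAS, as in the notebook
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(housing, housing['CHAS']):
    strat_train_set = housing.loc[train_index]

housing_labels = strat_train_set["MEDV"].copy()
housing_features = strat_train_set.drop("MEDV", axis=1)

# Impute medians, then standardize
my_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler()),
])
housing_num_tr = my_pipeline.fit_transform(housing_features)

model = RandomForestRegressor()
model.fit(housing_num_tr, housing_labels)

# Cross-validated RMSE, as in the evaluation section
scores = cross_val_score(model, housing_num_tr, housing_labels,
                         scoring="neg_mean_squared_error", cv=10)
print("RMSE scores:", np.sqrt(-scores))

dump(model, 'Dragon.joblib')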

Saving the model

from joblib import dump, load
dump(model, 'Dragon.joblib')
['Dragon.joblib']

Testing the model on test data

X_test = strat_test_set.drop("MEDV", axis=1)
Y_test = strat_test_set["MEDV"].copy()
X_test_prepared = my_pipeline.transform(X_test)
final_predictions = model.predict(X_test_prepared)
final_mse = mean_squared_error(Y_test, final_predictions)
final_rmse = np.sqrt(final_mse)
print(final_predictions, list(Y_test))
[21.677 27.305 23.505 26.071 33.286 23.538 23.962 20.566 22.833 20.708
10.716 22.184 27.266 20.117 14.888 18.516 18.891 19.56 18.603 30.954
26.097 17.884 31.356 15.755 24.494 18.51 14.758 31.662 19.401 19.927
26.366 19.579 44.863 30.435 21.185 41.317 33.413 22.699 27.043 22.154
20.099 13.894 22.188 29.413 24.3 11.971 8.584 33.796 42.761 20.492
23.544 36.146 15.006 25.768 19.273 29.083 19.15 18.957 24.013 20.661
20.234 33.596 11.051 21.132 22.764 21.56 15.365 20.567 19.611 18.95
30.78 21.264 29.118 20.077 47.323 15.337 20.099 36.954 44.614 16.403
24.902 9.933 44.136 20.735 25.446 15.725 37.658 11.599 31.505 21.01
18.366 43.46 20.664 14.163 6.919 12.478 33.819 29.267 34.242 22.043
17.898 32.772 19.638] [22.4, 25.0, 22.2, 24.4, 31.5, 23.3, 25.0, 24.5, 22.0, 20.3, 12.3, 21.2, 28.0, 23.8, 14.6, 20.8, 17.2, 19.5, 14.4, 28.5, 30.1, 19.6, 23.6, 19.4, 24.7, 19.9, 13.5, 30.3, 21.8, 20.4, 27.9, 18.9, 46.0, 23.9, 21.7, 44.0, 33.2, 21.6, 27.0, 19.8, 20.9, 16.7, 22.2, 24.8, 28.1, 12.7, 7.2, 37.3, 43.1, 23.8, 23.8, 50.0, 15.6, 27.1, 20.1, 29.1, 18.6, 19.5, 20.1, 21.1, 19.7, 33.1, 13.1, 20.4, 23.6, 20.6, 14.1, 22.7, 16.8, 21.5, 32.0, 20.3, 31.0, 19.2, 50.0, 13.6, 18.7, 67.0, 41.7, 10.4, 16.5, 7.2, 50.0, 19.6, 36.2, 12.7, 24.0, 10.2, 32.2, 24.3, 19.5, 21.9, 23.4, 11.7, 5.0, 13.8, 34.6, 26.7, 33.3, 23.2, 16.8, 32.0, 19.4]
final_rmse

4.881660683530373

prepared_data[0]

array([-0.42152521, -0.48685178, -0.24673925, -0.27144836,  0.2311586 ,
       -0.85631303, -1.42946756, -0.4510327 , -0.42117544, -0.12039257,
        0.3268577 ,  0.41580739,  0.64788652])

Using the model

from joblib import dump, load
import numpy as np
model = load('Dragon.joblib')
features = np.array([[-5.43942006, 4.12628155, -1.6165014, -0.67288841, -1.42262747,
-11.44443979304, -49.31238772, 7.61111401, -26.0016879 , -0.5778192 ,
-0.97491834, 0.41164221, -66.86091034]])
model.predict(features)
array([24.26])
