Boston House — Price Predictor
import pandas as pd

housing = pd.read_csv("C:\\Users\\Arslan Mushtaq\\Downloads\\archive(1)\\data.csv")
housing.head()

housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 511 entries, 0 to 510
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   CRIM     511 non-null    float64
 1   ZN       511 non-null    float64
 2   INDUS    511 non-null    float64
 3   CHAS     511 non-null    int64
 4   NOX      511 non-null    float64
 5   RM       506 non-null    float64
 6   AGE      511 non-null    float64
 7   DIS      511 non-null    float64
 8   RAD      511 non-null    int64
 9   TAX      511 non-null    int64
 10  PTRATIO  511 non-null    float64
 11  B        511 non-null    float64
 12  LSTAT    511 non-null    float64
 13  MEDV     511 non-null    float64
dtypes: float64(11), int64(3)
memory usage: 56.0 KB

housing['CHAS'].value_counts()

0    476
1     35
Name: CHAS, dtype: int64

housing.describe()

%matplotlib inline

# For plotting histograms
import matplotlib.pyplot as plt
housing.hist(bins=50, figsize=(20, 15))

[Figure: grid of histograms, one per attribute (AGE, B, CHAS, CRIM, DIS, INDUS, LSTAT, MEDV, NOX, PTRATIO, RAD, RM, TAX, ZN)]

Train-Test Splitting
# For learning purposes
import numpy as np

def split_train_test(data, test_ratio):
    np.random.seed(42)
    shuffled = np.random.permutation(len(data))
    print(shuffled)
    test_set_size = int(len(data) * test_ratio)
    test_indices = shuffled[:test_set_size]
    train_indices = shuffled[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]
A Second Method for Training and Testing
# train_set, test_set = split_train_test(housing, 0.2)
# print(f"Rows in train set: {len(train_set)}\nRows in test set: {len(test_set)}\n")

from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)
print(f"Rows in train set: {len(train_set)}\nRows in test set: {len(test_set)}\n")

Rows in train set: 408
Rows in test set: 103

from sklearn.model_selection import StratifiedShuffleSplit
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(housing, housing['CHAS']):
    strat_train_set = housing.loc[train_index]
    strat_test_set = housing.loc[test_index]

strat_test_set['CHAS'].value_counts()

0    96
1     7
Name: CHAS, dtype: int64

strat_train_set['CHAS'].value_counts()

0    380
1     28
Name: CHAS, dtype: int64

# Checking the ratio of 0s to 1s in each split
95/7    # 13.571428571428571
376/28  # 13.428571428571429

housing = strat_train_set.copy()
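The two ratios above confirm that stratified sampling kept the CHAS distribution nearly identical across the splits. A minimal sketch of the same check, assuming the split sets from above (value_counts(normalize=True) returns proportions directly):

for name, subset in [("train", strat_train_set), ("test", strat_test_set)]:
    print(name, subset['CHAS'].value_counts(normalize=True).round(3).to_dict())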
Looking for Correlations
corr_matrix = housing.corr()
corr_matrix['MEDV'].sort_values(ascending=False)

MEDV       1.000000
RM         0.681481
B          0.350283
ZN         0.341767
DIS        0.250670
CHAS       0.191563
AGE       -0.366230
RAD       -0.381955
CRIM      -0.395849
NOX       -0.425104
TAX       -0.467171
PTRATIO   -0.475870
INDUS     -0.481414
LSTAT     -0.720350
Name: MEDV, dtype: float64

from pandas.plotting import scatter_matrix
attributes = ["MEDV", "RM", "ZN", "LSTAT"]
scatter_matrix(housing[attributes], figsize=(12, 8))

[Figure: 4×4 scatter matrix of MEDV, RM, ZN and LSTAT]

housing.plot(kind="scatter", x="RM", y="MEDV", alpha=0.8)<AxesSubplot:xlabel='RM', ylabel='MEDV'>

Trying out Attribute combinations
housing["TAXRM"] = housing['TAX']/housing['RM']housing.head()

corr_matrix = housing.corr()
corr_matrix['MEDV'].sort_values(ascending=False)

MEDV       1.000000
RM         0.681481
B          0.350283
ZN         0.341767
DIS        0.250670
CHAS       0.191563
AGE       -0.366230
RAD       -0.381955
CRIM      -0.395849
NOX       -0.425104
TAX       -0.467171
PTRATIO   -0.475870
INDUS     -0.481414
TAXRM     -0.535069
LSTAT     -0.720350
Name: MEDV, dtype: float64

housing.plot(kind="scatter", x="TAXRM", y="MEDV", alpha=0.8)

[Figure: scatter plot of MEDV against TAXRM]

housing = strat_train_set.drop("MEDV", axis=1)
housing_labels = strat_train_set["MEDV"].copy()
Missing Attributes
# To take care of missing attributes, you have three options:
#     1. Get rid of the missing data points
#     2. Get rid of the whole attribute
#     3. Set the value to some value (0, mean or median)

a = housing.dropna(subset=["RM"])  # Option 1
a.shape
# Note that the original housing dataframe will remain unchanged

(403, 13)

housing.drop("RM", axis=1).shape  # Option 2
# Note that there is no RM column and also note that the original housing dataframe will remain unchanged

(408, 12)

median = housing["RM"].median()  # Compute median for Option 3
housing["RM"].fillna(median)  # Option 3
# Note that the original housing dataframe will remain unchanged

495    5.670
254    6.108
365    3.561
193    6.800
35     6.208
       ...
386    4.652
243    6.393
22     6.142
480    6.242
455    6.525
Name: RM, Length: 408, dtype: float64

housing.shape

(408, 13)

housing.describe()  # before we started filling missing attributes
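Because fillna() returns a new Series, persisting Option 3 would require assigning the result back. A one-line sketch (not executed here, since the SimpleImputer below handles the filling for us):

housing["RM"] = housing["RM"].fillna(median)  # hypothetical: assign back so the fill actually sticks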

from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy="median")
imputer.fit(housing)

SimpleImputer(strategy='median')

imputer.statistics_

array([2.86735e-01, 0.00000e+00, 9.90000e+00, 0.00000e+00, 5.38000e-01,
       6.20800e+00, 7.79500e+01, 3.10730e+00, 5.00000e+00, 3.35000e+02,
       1.90500e+01, 3.90980e+02, 1.17050e+01])

X = imputer.transform(housing)
housing_tr = pd.DataFrame(X, columns=housing.columns)
housing_tr.describe()
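With the median strategy, statistics_ is just the per-column median; a quick sketch to verify that against pandas, assuming the fitted imputer from above:

# Each entry of imputer.statistics_ should equal the corresponding column median
print(np.allclose(imputer.statistics_, housing.median().values))  # expected: True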

Scikit-learn Design
Scikit-learn exposes primarily three types of objects:
- Estimators: estimate some parameters based on a dataset, e.g. SimpleImputer. An estimator has a fit() method, which fits the dataset and computes internal parameters.
- Transformers: a transformer's transform() method takes input and returns output based on what fit() learned. There is also a convenience method, fit_transform(), which fits and then transforms in one call.
- Predictors: the LinearRegression model is an example of a predictor. fit() and predict() are its two common methods, and it also provides a score() method to evaluate the predictions.
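A short sketch of how these three object types fit together in code, reusing objects from this notebook (illustrative only, not part of the original pipeline):

from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression

# Estimator + transformer: fit() estimates the medians, transform() applies them
imputer = SimpleImputer(strategy="median")
imputer.fit(housing)                # learns internal parameters (statistics_)
X = imputer.transform(housing)      # applies what fit() learned
X = imputer.fit_transform(housing)  # convenience: both steps in one call

# Predictor: fit(), predict() and score()
model = LinearRegression()
model.fit(X, housing_labels)
model.predict(X[:5])                # predictions for the first five rows
model.score(X, housing_labels)      # R^2 of the predictions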
Feature Scaling
Primarily, two types of feature scaling methods:
- Min-max scaling (normalization): (value - min) / (max - min). Sklearn provides a class called MinMaxScaler for this.
- Standardization: (value - mean) / std. Sklearn provides a class called StandardScaler for this. Both are sketched below.
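A minimal sketch of both scalers on a single column (illustrative; the actual pipeline below applies StandardScaler to every column):

from sklearn.preprocessing import MinMaxScaler, StandardScaler

rm = housing[["RM"]].dropna()                  # one column, NaNs removed for the demo
print(MinMaxScaler().fit_transform(rm)[:3])    # every value squeezed into [0, 1]
print(StandardScaler().fit_transform(rm)[:3])  # zero mean, unit variance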
Creating a Pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

my_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    # ..... add as many as you want in your pipeline
    ('std_scaler', StandardScaler()),
])

housing_num_tr = my_pipeline.fit_transform(housing)
housing_num_tr.shape

(408, 13)
Selecting a desired model for Dragon Real Estates
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
# model = LinearRegression()
# model = DecisionTreeRegressor()
model = RandomForestRegressor()
model.fit(housing_num_tr, housing_labels)

RandomForestRegressor()

some_data = housing.iloc[:5]
some_labels = housing_labels.iloc[:5]
prepared_data = my_pipeline.transform(some_data)
model.predict(prepared_data)

array([21.748, 22.391, 27.574, 30.62 , 20.548])

list(some_labels)

[23.1, 21.9, 27.5, 31.1, 18.9]
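The commented-out lines above show the models that were tried before settling on the random forest. A sketch of comparing all three on training RMSE (note that a DecisionTreeRegressor will typically score near 0 here, which signals overfitting rather than a perfect model; the cross-validation below gives the honest picture):

from sklearn.metrics import mean_squared_error

for candidate in (LinearRegression(), DecisionTreeRegressor(), RandomForestRegressor()):
    candidate.fit(housing_num_tr, housing_labels)
    preds = candidate.predict(housing_num_tr)
    rmse = np.sqrt(mean_squared_error(housing_labels, preds))
    print(type(candidate).__name__, rmse)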
Evaluating the model
from sklearn.metrics import mean_squared_error
housing_predictions = model.predict(housing_num_tr)
mse = mean_squared_error(housing_labels, housing_predictions)
rmse = np.sqrt(mse)
rmse

1.5076935117888486

An RMSE this low on the very data the model was trained on is a warning sign of overfitting rather than proof of a good model, which is why we turn to cross-validation next.
Using a Better Evaluation Technique: Cross-Validation
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score

# 10 folds (1 2 3 4 5 6 7 8 9 10): train on 9, validate on the held-out 1, rotating
scores = cross_val_score(model, housing_num_tr, housing_labels, scoring="neg_mean_squared_error", cv=10)
rmse_scores = np.sqrt(-scores)
rmse_scores

array([3.07918078, 3.74030517, 4.90385085, 4.29989591, 3.00696454,
       2.27306873, 6.86476013, 2.90492124, 3.38468456, 3.32520813])

def print_scores(scores):
    print("Scores:", scores)
    print("Mean: ", scores.mean())
    print("Standard deviation: ", scores.std())

print_scores(rmse_scores)

Scores: [3.07918078 3.74030517 4.90385085 4.29989591 3.00696454 2.27306873
 6.86476013 2.90492124 3.38468456 3.32520813]
Mean:  3.778284003174746
Standard deviation:  1.246558053523797
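r2_score was imported above but never used; a one-line sketch of applying it to the training predictions (values closer to 1.0 are better):

print(r2_score(housing_labels, housing_predictions))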
Quiz: Convert this notebook into a Python file and run the pipeline using Visual Studio Code.
Saving the model
from joblib import dump, load
dump(model, 'Dragon.joblib')

['Dragon.joblib']
Testing the model on test data
X_test = strat_test_set.drop("MEDV", axis=1)
Y_test = strat_test_set["MEDV"].copy()
X_test_prepared = my_pipeline.transform(X_test)  # transform, not fit_transform: reuse the statistics learned on the training set
final_predictions = model.predict(X_test_prepared)
final_mse = mean_squared_error(Y_test, final_predictions)
final_rmse = np.sqrt(final_mse)
print(final_predictions, list(Y_test))

[21.677 27.305 23.505 26.071 33.286 23.538 23.962 20.566 22.833 20.708
 10.716 22.184 27.266 20.117 14.888 18.516 18.891 19.56  18.603 30.954
 26.097 17.884 31.356 15.755 24.494 18.51  14.758 31.662 19.401 19.927
 26.366 19.579 44.863 30.435 21.185 41.317 33.413 22.699 27.043 22.154
 20.099 13.894 22.188 29.413 24.3   11.971  8.584 33.796 42.761 20.492
 23.544 36.146 15.006 25.768 19.273 29.083 19.15  18.957 24.013 20.661
 20.234 33.596 11.051 21.132 22.764 21.56  15.365 20.567 19.611 18.95
 30.78  21.264 29.118 20.077 47.323 15.337 20.099 36.954 44.614 16.403
 24.902  9.933 44.136 20.735 25.446 15.725 37.658 11.599 31.505 21.01
 18.366 43.46  20.664 14.163  6.919 12.478 33.819 29.267 34.242 22.043
 17.898 32.772 19.638] [22.4, 25.0, 22.2, 24.4, 31.5, 23.3, 25.0, 24.5, 22.0, 20.3, 12.3, 21.2, 28.0, 23.8, 14.6, 20.8, 17.2, 19.5, 14.4, 28.5, 30.1, 19.6, 23.6, 19.4, 24.7, 19.9, 13.5, 30.3, 21.8, 20.4, 27.9, 18.9, 46.0, 23.9, 21.7, 44.0, 33.2, 21.6, 27.0, 19.8, 20.9, 16.7, 22.2, 24.8, 28.1, 12.7, 7.2, 37.3, 43.1, 23.8, 23.8, 50.0, 15.6, 27.1, 20.1, 29.1, 18.6, 19.5, 20.1, 21.1, 19.7, 33.1, 13.1, 20.4, 23.6, 20.6, 14.1, 22.7, 16.8, 21.5, 32.0, 20.3, 31.0, 19.2, 50.0, 13.6, 18.7, 67.0, 41.7, 10.4, 16.5, 7.2, 50.0, 19.6, 36.2, 12.7, 24.0, 10.2, 32.2, 24.3, 19.5, 21.9, 23.4, 11.7, 5.0, 13.8, 34.6, 26.7, 33.3, 23.2, 16.8, 32.0, 19.4]

final_rmse

4.881660683530373

prepared_data[0]

array([-0.42152521, -0.48685178, -0.24673925, -0.27144836,  0.2311586 ,
       -0.85631303, -1.42946756, -0.4510327 , -0.42117544, -0.12039257,
        0.3268577 ,  0.41580739,  0.64788652])
Using the model
from joblib import dump, load
import numpy as np
model = load('Dragon.joblib')
features = np.array([[-5.43942006, 4.12628155, -1.6165014, -0.67288841, -1.42262747,
-11.44443979304, -49.31238772, 7.61111401, -26.0016879 , -0.5778192 ,
-0.97491834, 0.41164221, -66.86091034]])
model.predict(features)

array([24.26])
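Note that the features array above is already in the pipeline's transformed space, which is why the raw numbers look so unusual. A safer sketch, assuming you start from raw attribute values and still have my_pipeline and housing in scope:

# Hypothetical: predict from raw (untransformed) attribute values
raw_row = housing.iloc[[0]]                 # any one-row DataFrame with the 13 raw columns
prepared = my_pipeline.transform(raw_row)   # impute and scale exactly as during training
print(model.predict(prepared))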