Data Engineer & DataOps
My LinkedIn
My GitHub
import pandas as pd
pd.options.display.max_columns = 999
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
import seaborn as sns
df = pd.read_csv('AmesHousing.txt', delimiter='\t')
df
Order | PID | MS SubClass | MS Zoning | Lot Frontage | Lot Area | Street | Alley | Lot Shape | Land Contour | Utilities | Lot Config | Land Slope | Neighborhood | Condition 1 | Condition 2 | Bldg Type | House Style | Overall Qual | Overall Cond | Year Built | Year Remod/Add | Roof Style | Roof Matl | Exterior 1st | Exterior 2nd | Mas Vnr Type | Mas Vnr Area | Exter Qual | Exter Cond | Foundation | Bsmt Qual | Bsmt Cond | Bsmt Exposure | BsmtFin Type 1 | BsmtFin SF 1 | BsmtFin Type 2 | BsmtFin SF 2 | Bsmt Unf SF | Total Bsmt SF | Heating | Heating QC | Central Air | Electrical | 1st Flr SF | 2nd Flr SF | Low Qual Fin SF | Gr Liv Area | Bsmt Full Bath | Bsmt Half Bath | Full Bath | Half Bath | Bedroom AbvGr | Kitchen AbvGr | Kitchen Qual | TotRms AbvGrd | Functional | Fireplaces | Fireplace Qu | Garage Type | Garage Yr Blt | Garage Finish | Garage Cars | Garage Area | Garage Qual | Garage Cond | Paved Drive | Wood Deck SF | Open Porch SF | Enclosed Porch | 3Ssn Porch | Screen Porch | Pool Area | Pool QC | Fence | Misc Feature | Misc Val | Mo Sold | Yr Sold | Sale Type | Sale Condition | SalePrice | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 526301100 | 20 | RL | 141.0 | 31770 | Pave | NaN | IR1 | Lvl | AllPub | Corner | Gtl | NAmes | Norm | Norm | 1Fam | 1Story | 6 | 5 | 1960 | 1960 | Hip | CompShg | BrkFace | Plywood | Stone | 112.0 | TA | TA | CBlock | TA | Gd | Gd | BLQ | 639.0 | Unf | 0.0 | 441.0 | 1080.0 | GasA | Fa | Y | SBrkr | 1656 | 0 | 0 | 1656 | 1.0 | 0.0 | 1 | 0 | 3 | 1 | TA | 7 | Typ | 2 | Gd | Attchd | 1960.0 | Fin | 2.0 | 528.0 | TA | TA | P | 210 | 62 | 0 | 0 | 0 | 0 | NaN | NaN | NaN | 0 | 5 | 2010 | WD | Normal | 215000 |
1 | 2 | 526350040 | 20 | RH | 80.0 | 11622 | Pave | NaN | Reg | Lvl | AllPub | Inside | Gtl | NAmes | Feedr | Norm | 1Fam | 1Story | 5 | 6 | 1961 | 1961 | Gable | CompShg | VinylSd | VinylSd | None | 0.0 | TA | TA | CBlock | TA | TA | No | Rec | 468.0 | LwQ | 144.0 | 270.0 | 882.0 | GasA | TA | Y | SBrkr | 896 | 0 | 0 | 896 | 0.0 | 0.0 | 1 | 0 | 2 | 1 | TA | 5 | Typ | 0 | NaN | Attchd | 1961.0 | Unf | 1.0 | 730.0 | TA | TA | Y | 140 | 0 | 0 | 0 | 120 | 0 | NaN | MnPrv | NaN | 0 | 6 | 2010 | WD | Normal | 105000 |
2 | 3 | 526351010 | 20 | RL | 81.0 | 14267 | Pave | NaN | IR1 | Lvl | AllPub | Corner | Gtl | NAmes | Norm | Norm | 1Fam | 1Story | 6 | 6 | 1958 | 1958 | Hip | CompShg | Wd Sdng | Wd Sdng | BrkFace | 108.0 | TA | TA | CBlock | TA | TA | No | ALQ | 923.0 | Unf | 0.0 | 406.0 | 1329.0 | GasA | TA | Y | SBrkr | 1329 | 0 | 0 | 1329 | 0.0 | 0.0 | 1 | 1 | 3 | 1 | Gd | 6 | Typ | 0 | NaN | Attchd | 1958.0 | Unf | 1.0 | 312.0 | TA | TA | Y | 393 | 36 | 0 | 0 | 0 | 0 | NaN | NaN | Gar2 | 12500 | 6 | 2010 | WD | Normal | 172000 |
3 | 4 | 526353030 | 20 | RL | 93.0 | 11160 | Pave | NaN | Reg | Lvl | AllPub | Corner | Gtl | NAmes | Norm | Norm | 1Fam | 1Story | 7 | 5 | 1968 | 1968 | Hip | CompShg | BrkFace | BrkFace | None | 0.0 | Gd | TA | CBlock | TA | TA | No | ALQ | 1065.0 | Unf | 0.0 | 1045.0 | 2110.0 | GasA | Ex | Y | SBrkr | 2110 | 0 | 0 | 2110 | 1.0 | 0.0 | 2 | 1 | 3 | 1 | Ex | 8 | Typ | 2 | TA | Attchd | 1968.0 | Fin | 2.0 | 522.0 | TA | TA | Y | 0 | 0 | 0 | 0 | 0 | 0 | NaN | NaN | NaN | 0 | 4 | 2010 | WD | Normal | 244000 |
4 | 5 | 527105010 | 60 | RL | 74.0 | 13830 | Pave | NaN | IR1 | Lvl | AllPub | Inside | Gtl | Gilbert | Norm | Norm | 1Fam | 2Story | 5 | 5 | 1997 | 1998 | Gable | CompShg | VinylSd | VinylSd | None | 0.0 | TA | TA | PConc | Gd | TA | No | GLQ | 791.0 | Unf | 0.0 | 137.0 | 928.0 | GasA | Gd | Y | SBrkr | 928 | 701 | 0 | 1629 | 0.0 | 0.0 | 2 | 1 | 3 | 1 | TA | 6 | Typ | 1 | TA | Attchd | 1997.0 | Fin | 2.0 | 482.0 | TA | TA | Y | 212 | 34 | 0 | 0 | 0 | 0 | NaN | MnPrv | NaN | 0 | 3 | 2010 | WD | Normal | 189900 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
2925 | 2926 | 923275080 | 80 | RL | 37.0 | 7937 | Pave | NaN | IR1 | Lvl | AllPub | CulDSac | Gtl | Mitchel | Norm | Norm | 1Fam | SLvl | 6 | 6 | 1984 | 1984 | Gable | CompShg | HdBoard | HdBoard | None | 0.0 | TA | TA | CBlock | TA | TA | Av | GLQ | 819.0 | Unf | 0.0 | 184.0 | 1003.0 | GasA | TA | Y | SBrkr | 1003 | 0 | 0 | 1003 | 1.0 | 0.0 | 1 | 0 | 3 | 1 | TA | 6 | Typ | 0 | NaN | Detchd | 1984.0 | Unf | 2.0 | 588.0 | TA | TA | Y | 120 | 0 | 0 | 0 | 0 | 0 | NaN | GdPrv | NaN | 0 | 3 | 2006 | WD | Normal | 142500 |
2926 | 2927 | 923276100 | 20 | RL | NaN | 8885 | Pave | NaN | IR1 | Low | AllPub | Inside | Mod | Mitchel | Norm | Norm | 1Fam | 1Story | 5 | 5 | 1983 | 1983 | Gable | CompShg | HdBoard | HdBoard | None | 0.0 | TA | TA | CBlock | Gd | TA | Av | BLQ | 301.0 | ALQ | 324.0 | 239.0 | 864.0 | GasA | TA | Y | SBrkr | 902 | 0 | 0 | 902 | 1.0 | 0.0 | 1 | 0 | 2 | 1 | TA | 5 | Typ | 0 | NaN | Attchd | 1983.0 | Unf | 2.0 | 484.0 | TA | TA | Y | 164 | 0 | 0 | 0 | 0 | 0 | NaN | MnPrv | NaN | 0 | 6 | 2006 | WD | Normal | 131000 |
2927 | 2928 | 923400125 | 85 | RL | 62.0 | 10441 | Pave | NaN | Reg | Lvl | AllPub | Inside | Gtl | Mitchel | Norm | Norm | 1Fam | SFoyer | 5 | 5 | 1992 | 1992 | Gable | CompShg | HdBoard | Wd Shng | None | 0.0 | TA | TA | PConc | Gd | TA | Av | GLQ | 337.0 | Unf | 0.0 | 575.0 | 912.0 | GasA | TA | Y | SBrkr | 970 | 0 | 0 | 970 | 0.0 | 1.0 | 1 | 0 | 3 | 1 | TA | 6 | Typ | 0 | NaN | NaN | NaN | NaN | 0.0 | 0.0 | NaN | NaN | Y | 80 | 32 | 0 | 0 | 0 | 0 | NaN | MnPrv | Shed | 700 | 7 | 2006 | WD | Normal | 132000 |
2928 | 2929 | 924100070 | 20 | RL | 77.0 | 10010 | Pave | NaN | Reg | Lvl | AllPub | Inside | Mod | Mitchel | Norm | Norm | 1Fam | 1Story | 5 | 5 | 1974 | 1975 | Gable | CompShg | HdBoard | HdBoard | None | 0.0 | TA | TA | CBlock | Gd | TA | Av | ALQ | 1071.0 | LwQ | 123.0 | 195.0 | 1389.0 | GasA | Gd | Y | SBrkr | 1389 | 0 | 0 | 1389 | 1.0 | 0.0 | 1 | 0 | 2 | 1 | TA | 6 | Typ | 1 | TA | Attchd | 1975.0 | RFn | 2.0 | 418.0 | TA | TA | Y | 240 | 38 | 0 | 0 | 0 | 0 | NaN | NaN | NaN | 0 | 4 | 2006 | WD | Normal | 170000 |
2929 | 2930 | 924151050 | 60 | RL | 74.0 | 9627 | Pave | NaN | Reg | Lvl | AllPub | Inside | Mod | Mitchel | Norm | Norm | 1Fam | 2Story | 7 | 5 | 1993 | 1994 | Gable | CompShg | HdBoard | HdBoard | BrkFace | 94.0 | TA | TA | PConc | Gd | TA | Av | LwQ | 758.0 | Unf | 0.0 | 238.0 | 996.0 | GasA | Ex | Y | SBrkr | 996 | 1004 | 0 | 2000 | 0.0 | 0.0 | 2 | 1 | 3 | 1 | TA | 9 | Typ | 1 | TA | Attchd | 1993.0 | Fin | 3.0 | 650.0 | TA | TA | Y | 190 | 48 | 0 | 0 | 0 | 0 | NaN | NaN | NaN | 0 | 11 | 2006 | WD | Normal | 188000 |
2930 rows × 82 columns
def transform_features(df):
data = df.copy()
# select text columns
t_df = data.select_dtypes(include='object')
t_isNull = t_df.isnull().sum()
# select columns with more than 1 missing values to drop
t_isNull_col = t_isNull[t_isNull >= 1].index
# drop selected columns
data = data.drop(t_isNull_col, axis=1)
# keep only columns with less than 5% missing values
isNull = data.isnull().sum()
data = data[isNull[isNull < len(data)*0.05].index]
# select numerical columns
num_df = data.select_dtypes(exclude='object')
# select suitable null columns to replace with mode
num_isNull = num_df.isnull().sum()
num_isNull_col = num_isNull[num_isNull != 0].index
# replace with mode
rep_val = data[num_isNull_col].mode().to_dict(orient='records')[0]
data = data.fillna(rep_val)
return data
def select_features(df, thred_corr):
corr = df.corr()['SalePrice'].abs().sort_values(ascending=False)
corr = corr[corr > thred_corr]
sel_df = df[corr.index]
return sel_df
def train_and_test(df, k):
np.random.seed(1)
num_df = df.select_dtypes(exclude='object')
num_df = num_df.reindex(np.random.permutation(num_df.index))
features = num_df.columns.drop('SalePrice')
lr = LinearRegression()
if k == 0:
train = num_df[:int(len(num_df)/2)]
test = df[int(len(num_df)/2):]
lr.fit(train[features], train['SalePrice'])
testP = lr.predict(test[features])
rmse = mean_squared_error(test['SalePrice'], testP, squared=False)
return rmse
elif k == 1:
train = num_df[:int(len(num_df)/2)]
test = df[int(len(num_df)/2):]
lr.fit(train[features], train['SalePrice'])
p1 = lr.predict(test[features])
rmse_1 = mean_squared_error(test['SalePrice'], p1, squared=False)
lr.fit(test[features], test['SalePrice'])
p2 = lr.predict(train[features])
rmse_2 = mean_squared_error(train['SalePrice'], p2, squared=False)
return np.mean([rmse_1, rmse_2])
else:
holder = list()
kf = KFold(n_splits=k, shuffle=True)
for train_index, test_index in kf.split(num_df):
train = num_df.iloc[train_index]
test = num_df.iloc[test_index]
#print("TRAIN:", train_index, "TEST:", test_index)
#print("TRAIN:", train, "TEST:", test)
lr.fit(train[features], train['SalePrice'])
testP = lr.predict(test[features])
rmse = mean_squared_error(test['SalePrice'], testP, squared=False)
holder.append(rmse)
return np.mean(rmse)
clean_df = transform_features(df)
clean_df
Order | PID | MS SubClass | MS Zoning | Lot Area | Street | Lot Shape | Land Contour | Utilities | Lot Config | Land Slope | Neighborhood | Condition 1 | Condition 2 | Bldg Type | House Style | Overall Qual | Overall Cond | Year Built | Year Remod/Add | Roof Style | Roof Matl | Exterior 1st | Exterior 2nd | Mas Vnr Area | Exter Qual | Exter Cond | Foundation | BsmtFin SF 1 | BsmtFin SF 2 | Bsmt Unf SF | Total Bsmt SF | Heating | Heating QC | Central Air | 1st Flr SF | 2nd Flr SF | Low Qual Fin SF | Gr Liv Area | Bsmt Full Bath | Bsmt Half Bath | Full Bath | Half Bath | Bedroom AbvGr | Kitchen AbvGr | Kitchen Qual | TotRms AbvGrd | Functional | Fireplaces | Garage Cars | Garage Area | Paved Drive | Wood Deck SF | Open Porch SF | Enclosed Porch | 3Ssn Porch | Screen Porch | Pool Area | Misc Val | Mo Sold | Yr Sold | Sale Type | Sale Condition | SalePrice | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 526301100 | 20 | RL | 31770 | Pave | IR1 | Lvl | AllPub | Corner | Gtl | NAmes | Norm | Norm | 1Fam | 1Story | 6 | 5 | 1960 | 1960 | Hip | CompShg | BrkFace | Plywood | 112.0 | TA | TA | CBlock | 639.0 | 0.0 | 441.0 | 1080.0 | GasA | Fa | Y | 1656 | 0 | 0 | 1656 | 1.0 | 0.0 | 1 | 0 | 3 | 1 | TA | 7 | Typ | 2 | 2.0 | 528.0 | P | 210 | 62 | 0 | 0 | 0 | 0 | 0 | 5 | 2010 | WD | Normal | 215000 |
1 | 2 | 526350040 | 20 | RH | 11622 | Pave | Reg | Lvl | AllPub | Inside | Gtl | NAmes | Feedr | Norm | 1Fam | 1Story | 5 | 6 | 1961 | 1961 | Gable | CompShg | VinylSd | VinylSd | 0.0 | TA | TA | CBlock | 468.0 | 144.0 | 270.0 | 882.0 | GasA | TA | Y | 896 | 0 | 0 | 896 | 0.0 | 0.0 | 1 | 0 | 2 | 1 | TA | 5 | Typ | 0 | 1.0 | 730.0 | Y | 140 | 0 | 0 | 0 | 120 | 0 | 0 | 6 | 2010 | WD | Normal | 105000 |
2 | 3 | 526351010 | 20 | RL | 14267 | Pave | IR1 | Lvl | AllPub | Corner | Gtl | NAmes | Norm | Norm | 1Fam | 1Story | 6 | 6 | 1958 | 1958 | Hip | CompShg | Wd Sdng | Wd Sdng | 108.0 | TA | TA | CBlock | 923.0 | 0.0 | 406.0 | 1329.0 | GasA | TA | Y | 1329 | 0 | 0 | 1329 | 0.0 | 0.0 | 1 | 1 | 3 | 1 | Gd | 6 | Typ | 0 | 1.0 | 312.0 | Y | 393 | 36 | 0 | 0 | 0 | 0 | 12500 | 6 | 2010 | WD | Normal | 172000 |
3 | 4 | 526353030 | 20 | RL | 11160 | Pave | Reg | Lvl | AllPub | Corner | Gtl | NAmes | Norm | Norm | 1Fam | 1Story | 7 | 5 | 1968 | 1968 | Hip | CompShg | BrkFace | BrkFace | 0.0 | Gd | TA | CBlock | 1065.0 | 0.0 | 1045.0 | 2110.0 | GasA | Ex | Y | 2110 | 0 | 0 | 2110 | 1.0 | 0.0 | 2 | 1 | 3 | 1 | Ex | 8 | Typ | 2 | 2.0 | 522.0 | Y | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 2010 | WD | Normal | 244000 |
4 | 5 | 527105010 | 60 | RL | 13830 | Pave | IR1 | Lvl | AllPub | Inside | Gtl | Gilbert | Norm | Norm | 1Fam | 2Story | 5 | 5 | 1997 | 1998 | Gable | CompShg | VinylSd | VinylSd | 0.0 | TA | TA | PConc | 791.0 | 0.0 | 137.0 | 928.0 | GasA | Gd | Y | 928 | 701 | 0 | 1629 | 0.0 | 0.0 | 2 | 1 | 3 | 1 | TA | 6 | Typ | 1 | 2.0 | 482.0 | Y | 212 | 34 | 0 | 0 | 0 | 0 | 0 | 3 | 2010 | WD | Normal | 189900 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
2925 | 2926 | 923275080 | 80 | RL | 7937 | Pave | IR1 | Lvl | AllPub | CulDSac | Gtl | Mitchel | Norm | Norm | 1Fam | SLvl | 6 | 6 | 1984 | 1984 | Gable | CompShg | HdBoard | HdBoard | 0.0 | TA | TA | CBlock | 819.0 | 0.0 | 184.0 | 1003.0 | GasA | TA | Y | 1003 | 0 | 0 | 1003 | 1.0 | 0.0 | 1 | 0 | 3 | 1 | TA | 6 | Typ | 0 | 2.0 | 588.0 | Y | 120 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 2006 | WD | Normal | 142500 |
2926 | 2927 | 923276100 | 20 | RL | 8885 | Pave | IR1 | Low | AllPub | Inside | Mod | Mitchel | Norm | Norm | 1Fam | 1Story | 5 | 5 | 1983 | 1983 | Gable | CompShg | HdBoard | HdBoard | 0.0 | TA | TA | CBlock | 301.0 | 324.0 | 239.0 | 864.0 | GasA | TA | Y | 902 | 0 | 0 | 902 | 1.0 | 0.0 | 1 | 0 | 2 | 1 | TA | 5 | Typ | 0 | 2.0 | 484.0 | Y | 164 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | 2006 | WD | Normal | 131000 |
2927 | 2928 | 923400125 | 85 | RL | 10441 | Pave | Reg | Lvl | AllPub | Inside | Gtl | Mitchel | Norm | Norm | 1Fam | SFoyer | 5 | 5 | 1992 | 1992 | Gable | CompShg | HdBoard | Wd Shng | 0.0 | TA | TA | PConc | 337.0 | 0.0 | 575.0 | 912.0 | GasA | TA | Y | 970 | 0 | 0 | 970 | 0.0 | 1.0 | 1 | 0 | 3 | 1 | TA | 6 | Typ | 0 | 0.0 | 0.0 | Y | 80 | 32 | 0 | 0 | 0 | 0 | 700 | 7 | 2006 | WD | Normal | 132000 |
2928 | 2929 | 924100070 | 20 | RL | 10010 | Pave | Reg | Lvl | AllPub | Inside | Mod | Mitchel | Norm | Norm | 1Fam | 1Story | 5 | 5 | 1974 | 1975 | Gable | CompShg | HdBoard | HdBoard | 0.0 | TA | TA | CBlock | 1071.0 | 123.0 | 195.0 | 1389.0 | GasA | Gd | Y | 1389 | 0 | 0 | 1389 | 1.0 | 0.0 | 1 | 0 | 2 | 1 | TA | 6 | Typ | 1 | 2.0 | 418.0 | Y | 240 | 38 | 0 | 0 | 0 | 0 | 0 | 4 | 2006 | WD | Normal | 170000 |
2929 | 2930 | 924151050 | 60 | RL | 9627 | Pave | Reg | Lvl | AllPub | Inside | Mod | Mitchel | Norm | Norm | 1Fam | 2Story | 7 | 5 | 1993 | 1994 | Gable | CompShg | HdBoard | HdBoard | 94.0 | TA | TA | PConc | 758.0 | 0.0 | 238.0 | 996.0 | GasA | Ex | Y | 996 | 1004 | 0 | 2000 | 0.0 | 0.0 | 2 | 1 | 3 | 1 | TA | 9 | Typ | 1 | 3.0 | 650.0 | Y | 190 | 48 | 0 | 0 | 0 | 0 | 0 | 11 | 2006 | WD | Normal | 188000 |
2930 rows × 64 columns
# add new columns
clean_df['Year Since Sold'] = clean_df['Yr Sold'] - clean_df['Year Built']
clean_df['Year Since Re'] = clean_df['Yr Sold'] - clean_df['Year Remod/Add']
# keep only valid values from these columns
clean_df = clean_df[(clean_df['Year Since Sold'] > 0) &
(clean_df['Year Since Re'] > 0)]
# keep only relevant features
clean_df.drop(['Mo Sold', 'Yr Sold', 'Sale Type', 'Sale Condition', 'PID', 'Order'],
axis=1, inplace=True)
/Users/MimiHMB/anaconda3/lib/python3.7/site-packages/pandas/core/frame.py:3997: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
errors=errors,
clean_df
MS SubClass | MS Zoning | Lot Area | Street | Lot Shape | Land Contour | Utilities | Lot Config | Land Slope | Neighborhood | Condition 1 | Condition 2 | Bldg Type | House Style | Overall Qual | Overall Cond | Year Built | Year Remod/Add | Roof Style | Roof Matl | Exterior 1st | Exterior 2nd | Mas Vnr Area | Exter Qual | Exter Cond | Foundation | BsmtFin SF 1 | BsmtFin SF 2 | Bsmt Unf SF | Total Bsmt SF | Heating | Heating QC | Central Air | 1st Flr SF | 2nd Flr SF | Low Qual Fin SF | Gr Liv Area | Bsmt Full Bath | Bsmt Half Bath | Full Bath | Half Bath | Bedroom AbvGr | Kitchen AbvGr | Kitchen Qual | TotRms AbvGrd | Functional | Fireplaces | Garage Cars | Garage Area | Paved Drive | Wood Deck SF | Open Porch SF | Enclosed Porch | 3Ssn Porch | Screen Porch | Pool Area | Misc Val | SalePrice | Year Since Sold | Year Since Re | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 20 | RL | 31770 | Pave | IR1 | Lvl | AllPub | Corner | Gtl | NAmes | Norm | Norm | 1Fam | 1Story | 6 | 5 | 1960 | 1960 | Hip | CompShg | BrkFace | Plywood | 112.0 | TA | TA | CBlock | 639.0 | 0.0 | 441.0 | 1080.0 | GasA | Fa | Y | 1656 | 0 | 0 | 1656 | 1.0 | 0.0 | 1 | 0 | 3 | 1 | TA | 7 | Typ | 2 | 2.0 | 528.0 | P | 210 | 62 | 0 | 0 | 0 | 0 | 0 | 215000 | 50 | 50 |
1 | 20 | RH | 11622 | Pave | Reg | Lvl | AllPub | Inside | Gtl | NAmes | Feedr | Norm | 1Fam | 1Story | 5 | 6 | 1961 | 1961 | Gable | CompShg | VinylSd | VinylSd | 0.0 | TA | TA | CBlock | 468.0 | 144.0 | 270.0 | 882.0 | GasA | TA | Y | 896 | 0 | 0 | 896 | 0.0 | 0.0 | 1 | 0 | 2 | 1 | TA | 5 | Typ | 0 | 1.0 | 730.0 | Y | 140 | 0 | 0 | 0 | 120 | 0 | 0 | 105000 | 49 | 49 |
2 | 20 | RL | 14267 | Pave | IR1 | Lvl | AllPub | Corner | Gtl | NAmes | Norm | Norm | 1Fam | 1Story | 6 | 6 | 1958 | 1958 | Hip | CompShg | Wd Sdng | Wd Sdng | 108.0 | TA | TA | CBlock | 923.0 | 0.0 | 406.0 | 1329.0 | GasA | TA | Y | 1329 | 0 | 0 | 1329 | 0.0 | 0.0 | 1 | 1 | 3 | 1 | Gd | 6 | Typ | 0 | 1.0 | 312.0 | Y | 393 | 36 | 0 | 0 | 0 | 0 | 12500 | 172000 | 52 | 52 |
3 | 20 | RL | 11160 | Pave | Reg | Lvl | AllPub | Corner | Gtl | NAmes | Norm | Norm | 1Fam | 1Story | 7 | 5 | 1968 | 1968 | Hip | CompShg | BrkFace | BrkFace | 0.0 | Gd | TA | CBlock | 1065.0 | 0.0 | 1045.0 | 2110.0 | GasA | Ex | Y | 2110 | 0 | 0 | 2110 | 1.0 | 0.0 | 2 | 1 | 3 | 1 | Ex | 8 | Typ | 2 | 2.0 | 522.0 | Y | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 244000 | 42 | 42 |
4 | 60 | RL | 13830 | Pave | IR1 | Lvl | AllPub | Inside | Gtl | Gilbert | Norm | Norm | 1Fam | 2Story | 5 | 5 | 1997 | 1998 | Gable | CompShg | VinylSd | VinylSd | 0.0 | TA | TA | PConc | 791.0 | 0.0 | 137.0 | 928.0 | GasA | Gd | Y | 928 | 701 | 0 | 1629 | 0.0 | 0.0 | 2 | 1 | 3 | 1 | TA | 6 | Typ | 1 | 2.0 | 482.0 | Y | 212 | 34 | 0 | 0 | 0 | 0 | 0 | 189900 | 13 | 12 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
2925 | 80 | RL | 7937 | Pave | IR1 | Lvl | AllPub | CulDSac | Gtl | Mitchel | Norm | Norm | 1Fam | SLvl | 6 | 6 | 1984 | 1984 | Gable | CompShg | HdBoard | HdBoard | 0.0 | TA | TA | CBlock | 819.0 | 0.0 | 184.0 | 1003.0 | GasA | TA | Y | 1003 | 0 | 0 | 1003 | 1.0 | 0.0 | 1 | 0 | 3 | 1 | TA | 6 | Typ | 0 | 2.0 | 588.0 | Y | 120 | 0 | 0 | 0 | 0 | 0 | 0 | 142500 | 22 | 22 |
2926 | 20 | RL | 8885 | Pave | IR1 | Low | AllPub | Inside | Mod | Mitchel | Norm | Norm | 1Fam | 1Story | 5 | 5 | 1983 | 1983 | Gable | CompShg | HdBoard | HdBoard | 0.0 | TA | TA | CBlock | 301.0 | 324.0 | 239.0 | 864.0 | GasA | TA | Y | 902 | 0 | 0 | 902 | 1.0 | 0.0 | 1 | 0 | 2 | 1 | TA | 5 | Typ | 0 | 2.0 | 484.0 | Y | 164 | 0 | 0 | 0 | 0 | 0 | 0 | 131000 | 23 | 23 |
2927 | 85 | RL | 10441 | Pave | Reg | Lvl | AllPub | Inside | Gtl | Mitchel | Norm | Norm | 1Fam | SFoyer | 5 | 5 | 1992 | 1992 | Gable | CompShg | HdBoard | Wd Shng | 0.0 | TA | TA | PConc | 337.0 | 0.0 | 575.0 | 912.0 | GasA | TA | Y | 970 | 0 | 0 | 970 | 0.0 | 1.0 | 1 | 0 | 3 | 1 | TA | 6 | Typ | 0 | 0.0 | 0.0 | Y | 80 | 32 | 0 | 0 | 0 | 0 | 700 | 132000 | 14 | 14 |
2928 | 20 | RL | 10010 | Pave | Reg | Lvl | AllPub | Inside | Mod | Mitchel | Norm | Norm | 1Fam | 1Story | 5 | 5 | 1974 | 1975 | Gable | CompShg | HdBoard | HdBoard | 0.0 | TA | TA | CBlock | 1071.0 | 123.0 | 195.0 | 1389.0 | GasA | Gd | Y | 1389 | 0 | 0 | 1389 | 1.0 | 0.0 | 1 | 0 | 2 | 1 | TA | 6 | Typ | 1 | 2.0 | 418.0 | Y | 240 | 38 | 0 | 0 | 0 | 0 | 0 | 170000 | 32 | 31 |
2929 | 60 | RL | 9627 | Pave | Reg | Lvl | AllPub | Inside | Mod | Mitchel | Norm | Norm | 1Fam | 2Story | 7 | 5 | 1993 | 1994 | Gable | CompShg | HdBoard | HdBoard | 94.0 | TA | TA | PConc | 758.0 | 0.0 | 238.0 | 996.0 | GasA | Ex | Y | 996 | 1004 | 0 | 2000 | 0.0 | 0.0 | 2 | 1 | 3 | 1 | TA | 9 | Typ | 1 | 3.0 | 650.0 | Y | 190 | 48 | 0 | 0 | 0 | 0 | 0 | 188000 | 13 | 12 |
2689 rows × 60 columns
# recheck if there is any missing value
any(clean_df.isnull().sum() > 0)
False
sel_df = select_features(clean_df, 0.4)
sel_df
SalePrice | Overall Qual | Gr Liv Area | Total Bsmt SF | 1st Flr SF | Garage Cars | Garage Area | Full Bath | Year Built | Year Since Sold | Year Since Re | Year Remod/Add | Fireplaces | TotRms AbvGrd | Mas Vnr Area | BsmtFin SF 1 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 215000 | 6 | 1656 | 1080.0 | 1656 | 2.0 | 528.0 | 1 | 1960 | 50 | 50 | 1960 | 2 | 7 | 112.0 | 639.0 |
1 | 105000 | 5 | 896 | 882.0 | 896 | 1.0 | 730.0 | 1 | 1961 | 49 | 49 | 1961 | 0 | 5 | 0.0 | 468.0 |
2 | 172000 | 6 | 1329 | 1329.0 | 1329 | 1.0 | 312.0 | 1 | 1958 | 52 | 52 | 1958 | 0 | 6 | 108.0 | 923.0 |
3 | 244000 | 7 | 2110 | 2110.0 | 2110 | 2.0 | 522.0 | 2 | 1968 | 42 | 42 | 1968 | 2 | 8 | 0.0 | 1065.0 |
4 | 189900 | 5 | 1629 | 928.0 | 928 | 2.0 | 482.0 | 2 | 1997 | 13 | 12 | 1998 | 1 | 6 | 0.0 | 791.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
2925 | 142500 | 6 | 1003 | 1003.0 | 1003 | 2.0 | 588.0 | 1 | 1984 | 22 | 22 | 1984 | 0 | 6 | 0.0 | 819.0 |
2926 | 131000 | 5 | 902 | 864.0 | 902 | 2.0 | 484.0 | 1 | 1983 | 23 | 23 | 1983 | 0 | 5 | 0.0 | 301.0 |
2927 | 132000 | 5 | 970 | 912.0 | 970 | 0.0 | 0.0 | 1 | 1992 | 14 | 14 | 1992 | 0 | 6 | 0.0 | 337.0 |
2928 | 170000 | 5 | 1389 | 1389.0 | 1389 | 2.0 | 418.0 | 1 | 1974 | 32 | 31 | 1975 | 1 | 6 | 0.0 | 1071.0 |
2929 | 188000 | 7 | 2000 | 996.0 | 996 | 3.0 | 650.0 | 2 | 1993 | 13 | 12 | 1994 | 1 | 9 | 94.0 | 758.0 |
2689 rows × 16 columns
# make a dictionary of value counts for all columns in clean_df (the df resulted from initial aggregation)
holder = dict()
for i in clean_df.columns:
holder[i] = len(clean_df[i].value_counts())
holder
{'MS SubClass': 16,
'MS Zoning': 7,
'Lot Area': 1795,
'Street': 2,
'Lot Shape': 4,
'Land Contour': 4,
'Utilities': 3,
'Lot Config': 5,
'Land Slope': 3,
'Neighborhood': 28,
'Condition 1': 9,
'Condition 2': 8,
'Bldg Type': 5,
'House Style': 8,
'Overall Qual': 10,
'Overall Cond': 9,
'Year Built': 117,
'Year Remod/Add': 60,
'Roof Style': 6,
'Roof Matl': 7,
'Exterior 1st': 16,
'Exterior 2nd': 16,
'Mas Vnr Area': 413,
'Exter Qual': 4,
'Exter Cond': 5,
'Foundation': 6,
'BsmtFin SF 1': 942,
'BsmtFin SF 2': 273,
'Bsmt Unf SF': 1060,
'Total Bsmt SF': 978,
'Heating': 6,
'Heating QC': 5,
'Central Air': 2,
'1st Flr SF': 1028,
'2nd Flr SF': 597,
'Low Qual Fin SF': 36,
'Gr Liv Area': 1226,
'Bsmt Full Bath': 4,
'Bsmt Half Bath': 3,
'Full Bath': 5,
'Half Bath': 3,
'Bedroom AbvGr': 8,
'Kitchen AbvGr': 4,
'Kitchen Qual': 5,
'TotRms AbvGrd': 13,
'Functional': 8,
'Fireplaces': 5,
'Garage Cars': 6,
'Garage Area': 570,
'Paved Drive': 3,
'Wood Deck SF': 376,
'Open Porch SF': 238,
'Enclosed Porch': 181,
'3Ssn Porch': 27,
'Screen Porch': 119,
'Pool Area': 13,
'Misc Val': 37,
'SalePrice': 864,
'Year Since Sold': 126,
'Year Since Re': 60}
# nominal columns given by the data set description
nominal_features = ["PID", "MS SubClass", "MS Zoning", "Street", "Alley", "Land Contour", "Lot Config", "Neighborhood",
"Condition 1", "Condition 2", "Bldg Type", "House Style", "Roof Style", "Roof Matl", "Exterior 1st",
"Exterior 2nd", "Mas Vnr Type", "Foundation", "Heating", "Central Air", "Garage Type",
"Misc Feature", "Sale Type", "Sale Condition"]
# choose only nominal features that have at most 10 unique values
to_cat = list()
for i in holder.keys():
if (i in nominal_features) & (holder[i] <= 10):
to_cat.append(i)
to_cat
['MS Zoning',
'Street',
'Land Contour',
'Lot Config',
'Condition 1',
'Condition 2',
'Bldg Type',
'House Style',
'Roof Style',
'Roof Matl',
'Foundation',
'Heating',
'Central Air']
# categorize these nominal features and make corresponding dummies
for i in to_cat:
sel_df[i] = clean_df[i].astype('category')
sel_df = pd.concat([sel_df,
pd.get_dummies(sel_df.select_dtypes(include=['category']))], axis=1).drop(i, axis=1)
/Users/MimiHMB/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:3: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
This is separate from the ipykernel package so we can avoid doing imports until
sel_df
SalePrice | Overall Qual | Gr Liv Area | Total Bsmt SF | 1st Flr SF | Garage Cars | Garage Area | Full Bath | Year Built | Year Since Sold | Year Since Re | Year Remod/Add | Fireplaces | TotRms AbvGrd | Mas Vnr Area | BsmtFin SF 1 | MS Zoning_A (agr) | MS Zoning_C (all) | MS Zoning_FV | MS Zoning_I (all) | MS Zoning_RH | MS Zoning_RL | MS Zoning_RM | Street_Grvl | Street_Pave | Land Contour_Bnk | Land Contour_HLS | Land Contour_Low | Land Contour_Lvl | Lot Config_Corner | Lot Config_CulDSac | Lot Config_FR2 | Lot Config_FR3 | Lot Config_Inside | Condition 1_Artery | Condition 1_Feedr | Condition 1_Norm | Condition 1_PosA | Condition 1_PosN | Condition 1_RRAe | Condition 1_RRAn | Condition 1_RRNe | Condition 1_RRNn | Condition 2_Artery | Condition 2_Feedr | Condition 2_Norm | Condition 2_PosA | Condition 2_PosN | Condition 2_RRAe | Condition 2_RRAn | Condition 2_RRNn | Bldg Type_1Fam | Bldg Type_2fmCon | Bldg Type_Duplex | Bldg Type_Twnhs | Bldg Type_TwnhsE | House Style_1.5Fin | House Style_1.5Unf | House Style_1Story | House Style_2.5Fin | House Style_2.5Unf | House Style_2Story | House Style_SFoyer | House Style_SLvl | Roof Style_Flat | Roof Style_Gable | Roof Style_Gambrel | Roof Style_Hip | Roof Style_Mansard | Roof Style_Shed | Roof Matl_CompShg | Roof Matl_Membran | Roof Matl_Metal | Roof Matl_Roll | Roof Matl_Tar&Grv | Roof Matl_WdShake | Roof Matl_WdShngl | Foundation_BrkTil | Foundation_CBlock | Foundation_PConc | Foundation_Slab | Foundation_Stone | Foundation_Wood | Heating_Floor | Heating_GasA | Heating_GasW | Heating_Grav | Heating_OthW | Heating_Wall | Central Air_N | Central Air_Y | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 215000 | 6 | 1656 | 1080.0 | 1656 | 2.0 | 528.0 | 1 | 1960 | 50 | 50 | 1960 | 2 | 7 | 112.0 | 639.0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 |
1 | 105000 | 5 | 896 | 882.0 | 896 | 1.0 | 730.0 | 1 | 1961 | 49 | 49 | 1961 | 0 | 5 | 0.0 | 468.0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 |
2 | 172000 | 6 | 1329 | 1329.0 | 1329 | 1.0 | 312.0 | 1 | 1958 | 52 | 52 | 1958 | 0 | 6 | 108.0 | 923.0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 |
3 | 244000 | 7 | 2110 | 2110.0 | 2110 | 2.0 | 522.0 | 2 | 1968 | 42 | 42 | 1968 | 2 | 8 | 0.0 | 1065.0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 |
4 | 189900 | 5 | 1629 | 928.0 | 928 | 2.0 | 482.0 | 2 | 1997 | 13 | 12 | 1998 | 1 | 6 | 0.0 | 791.0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
2925 | 142500 | 6 | 1003 | 1003.0 | 1003 | 2.0 | 588.0 | 1 | 1984 | 22 | 22 | 1984 | 0 | 6 | 0.0 | 819.0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 |
2926 | 131000 | 5 | 902 | 864.0 | 902 | 2.0 | 484.0 | 1 | 1983 | 23 | 23 | 1983 | 0 | 5 | 0.0 | 301.0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 |
2927 | 132000 | 5 | 970 | 912.0 | 970 | 0.0 | 0.0 | 1 | 1992 | 14 | 14 | 1992 | 0 | 6 | 0.0 | 337.0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 |
2928 | 170000 | 5 | 1389 | 1389.0 | 1389 | 2.0 | 418.0 | 1 | 1974 | 32 | 31 | 1975 | 1 | 6 | 0.0 | 1071.0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 |
2929 | 188000 | 7 | 2000 | 996.0 | 996 | 3.0 | 650.0 | 2 | 1993 | 13 | 12 | 1994 | 1 | 9 | 94.0 | 758.0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 |
2689 rows × 91 columns
x = list(range(0, 30))
y = [train_and_test(sel_df, i) for i in x]
Lowest RMSE achieved at 30 folds
plt.plot(x, y)
plt.xlabel('k fold')
plt.ylabel('RMSE')
plt.show()