My Bui (Mimi)

Data Engineer & DataOps

My LinkedIn
My GitHub

Multivariate linear regression: recommended prices for house

import pandas as pd
pd.options.display.max_columns = 999
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
import seaborn as sns
df = pd.read_csv('AmesHousing.txt', delimiter='\t')
df
Order PID MS SubClass MS Zoning Lot Frontage Lot Area Street Alley Lot Shape Land Contour Utilities Lot Config Land Slope Neighborhood Condition 1 Condition 2 Bldg Type House Style Overall Qual Overall Cond Year Built Year Remod/Add Roof Style Roof Matl Exterior 1st Exterior 2nd Mas Vnr Type Mas Vnr Area Exter Qual Exter Cond Foundation Bsmt Qual Bsmt Cond Bsmt Exposure BsmtFin Type 1 BsmtFin SF 1 BsmtFin Type 2 BsmtFin SF 2 Bsmt Unf SF Total Bsmt SF Heating Heating QC Central Air Electrical 1st Flr SF 2nd Flr SF Low Qual Fin SF Gr Liv Area Bsmt Full Bath Bsmt Half Bath Full Bath Half Bath Bedroom AbvGr Kitchen AbvGr Kitchen Qual TotRms AbvGrd Functional Fireplaces Fireplace Qu Garage Type Garage Yr Blt Garage Finish Garage Cars Garage Area Garage Qual Garage Cond Paved Drive Wood Deck SF Open Porch SF Enclosed Porch 3Ssn Porch Screen Porch Pool Area Pool QC Fence Misc Feature Misc Val Mo Sold Yr Sold Sale Type Sale Condition SalePrice
0 1 526301100 20 RL 141.0 31770 Pave NaN IR1 Lvl AllPub Corner Gtl NAmes Norm Norm 1Fam 1Story 6 5 1960 1960 Hip CompShg BrkFace Plywood Stone 112.0 TA TA CBlock TA Gd Gd BLQ 639.0 Unf 0.0 441.0 1080.0 GasA Fa Y SBrkr 1656 0 0 1656 1.0 0.0 1 0 3 1 TA 7 Typ 2 Gd Attchd 1960.0 Fin 2.0 528.0 TA TA P 210 62 0 0 0 0 NaN NaN NaN 0 5 2010 WD Normal 215000
1 2 526350040 20 RH 80.0 11622 Pave NaN Reg Lvl AllPub Inside Gtl NAmes Feedr Norm 1Fam 1Story 5 6 1961 1961 Gable CompShg VinylSd VinylSd None 0.0 TA TA CBlock TA TA No Rec 468.0 LwQ 144.0 270.0 882.0 GasA TA Y SBrkr 896 0 0 896 0.0 0.0 1 0 2 1 TA 5 Typ 0 NaN Attchd 1961.0 Unf 1.0 730.0 TA TA Y 140 0 0 0 120 0 NaN MnPrv NaN 0 6 2010 WD Normal 105000
2 3 526351010 20 RL 81.0 14267 Pave NaN IR1 Lvl AllPub Corner Gtl NAmes Norm Norm 1Fam 1Story 6 6 1958 1958 Hip CompShg Wd Sdng Wd Sdng BrkFace 108.0 TA TA CBlock TA TA No ALQ 923.0 Unf 0.0 406.0 1329.0 GasA TA Y SBrkr 1329 0 0 1329 0.0 0.0 1 1 3 1 Gd 6 Typ 0 NaN Attchd 1958.0 Unf 1.0 312.0 TA TA Y 393 36 0 0 0 0 NaN NaN Gar2 12500 6 2010 WD Normal 172000
3 4 526353030 20 RL 93.0 11160 Pave NaN Reg Lvl AllPub Corner Gtl NAmes Norm Norm 1Fam 1Story 7 5 1968 1968 Hip CompShg BrkFace BrkFace None 0.0 Gd TA CBlock TA TA No ALQ 1065.0 Unf 0.0 1045.0 2110.0 GasA Ex Y SBrkr 2110 0 0 2110 1.0 0.0 2 1 3 1 Ex 8 Typ 2 TA Attchd 1968.0 Fin 2.0 522.0 TA TA Y 0 0 0 0 0 0 NaN NaN NaN 0 4 2010 WD Normal 244000
4 5 527105010 60 RL 74.0 13830 Pave NaN IR1 Lvl AllPub Inside Gtl Gilbert Norm Norm 1Fam 2Story 5 5 1997 1998 Gable CompShg VinylSd VinylSd None 0.0 TA TA PConc Gd TA No GLQ 791.0 Unf 0.0 137.0 928.0 GasA Gd Y SBrkr 928 701 0 1629 0.0 0.0 2 1 3 1 TA 6 Typ 1 TA Attchd 1997.0 Fin 2.0 482.0 TA TA Y 212 34 0 0 0 0 NaN MnPrv NaN 0 3 2010 WD Normal 189900
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2925 2926 923275080 80 RL 37.0 7937 Pave NaN IR1 Lvl AllPub CulDSac Gtl Mitchel Norm Norm 1Fam SLvl 6 6 1984 1984 Gable CompShg HdBoard HdBoard None 0.0 TA TA CBlock TA TA Av GLQ 819.0 Unf 0.0 184.0 1003.0 GasA TA Y SBrkr 1003 0 0 1003 1.0 0.0 1 0 3 1 TA 6 Typ 0 NaN Detchd 1984.0 Unf 2.0 588.0 TA TA Y 120 0 0 0 0 0 NaN GdPrv NaN 0 3 2006 WD Normal 142500
2926 2927 923276100 20 RL NaN 8885 Pave NaN IR1 Low AllPub Inside Mod Mitchel Norm Norm 1Fam 1Story 5 5 1983 1983 Gable CompShg HdBoard HdBoard None 0.0 TA TA CBlock Gd TA Av BLQ 301.0 ALQ 324.0 239.0 864.0 GasA TA Y SBrkr 902 0 0 902 1.0 0.0 1 0 2 1 TA 5 Typ 0 NaN Attchd 1983.0 Unf 2.0 484.0 TA TA Y 164 0 0 0 0 0 NaN MnPrv NaN 0 6 2006 WD Normal 131000
2927 2928 923400125 85 RL 62.0 10441 Pave NaN Reg Lvl AllPub Inside Gtl Mitchel Norm Norm 1Fam SFoyer 5 5 1992 1992 Gable CompShg HdBoard Wd Shng None 0.0 TA TA PConc Gd TA Av GLQ 337.0 Unf 0.0 575.0 912.0 GasA TA Y SBrkr 970 0 0 970 0.0 1.0 1 0 3 1 TA 6 Typ 0 NaN NaN NaN NaN 0.0 0.0 NaN NaN Y 80 32 0 0 0 0 NaN MnPrv Shed 700 7 2006 WD Normal 132000
2928 2929 924100070 20 RL 77.0 10010 Pave NaN Reg Lvl AllPub Inside Mod Mitchel Norm Norm 1Fam 1Story 5 5 1974 1975 Gable CompShg HdBoard HdBoard None 0.0 TA TA CBlock Gd TA Av ALQ 1071.0 LwQ 123.0 195.0 1389.0 GasA Gd Y SBrkr 1389 0 0 1389 1.0 0.0 1 0 2 1 TA 6 Typ 1 TA Attchd 1975.0 RFn 2.0 418.0 TA TA Y 240 38 0 0 0 0 NaN NaN NaN 0 4 2006 WD Normal 170000
2929 2930 924151050 60 RL 74.0 9627 Pave NaN Reg Lvl AllPub Inside Mod Mitchel Norm Norm 1Fam 2Story 7 5 1993 1994 Gable CompShg HdBoard HdBoard BrkFace 94.0 TA TA PConc Gd TA Av LwQ 758.0 Unf 0.0 238.0 996.0 GasA Ex Y SBrkr 996 1004 0 2000 0.0 0.0 2 1 3 1 TA 9 Typ 1 TA Attchd 1993.0 Fin 3.0 650.0 TA TA Y 190 48 0 0 0 0 NaN NaN NaN 0 11 2006 WD Normal 188000

2930 rows × 82 columns

def transform_features(df):
    data = df.copy()
    # select text columns
    t_df = data.select_dtypes(include='object')
    t_isNull = t_df.isnull().sum()
    # select columns with more than 1 missing values to drop
    t_isNull_col = t_isNull[t_isNull >= 1].index
    # drop selected columns
    data = data.drop(t_isNull_col, axis=1)
    
    # keep only columns with less than 5% missing values
    isNull = data.isnull().sum()
    data = data[isNull[isNull < len(data)*0.05].index]
    
    # select numerical columns
    num_df = data.select_dtypes(exclude='object')
    # select suitable null columns to replace with mode
    num_isNull = num_df.isnull().sum()
    num_isNull_col = num_isNull[num_isNull != 0].index
    # replace with mode
    rep_val = data[num_isNull_col].mode().to_dict(orient='records')[0]
    data = data.fillna(rep_val)
    return data

def select_features(df, thred_corr):
    corr = df.corr()['SalePrice'].abs().sort_values(ascending=False)
    corr = corr[corr > thred_corr]
    sel_df = df[corr.index]
    return sel_df

def train_and_test(df, k):
    np.random.seed(1)
    num_df = df.select_dtypes(exclude='object')
    num_df = num_df.reindex(np.random.permutation(num_df.index))
    features = num_df.columns.drop('SalePrice')
    lr = LinearRegression()
    if k == 0:
        train = num_df[:int(len(num_df)/2)]
        test = df[int(len(num_df)/2):]
        lr.fit(train[features], train['SalePrice'])
        testP = lr.predict(test[features])
        rmse = mean_squared_error(test['SalePrice'], testP, squared=False)
        return rmse
    elif k == 1:
        train = num_df[:int(len(num_df)/2)]
        test = df[int(len(num_df)/2):]
        lr.fit(train[features], train['SalePrice'])
        p1 = lr.predict(test[features])
        rmse_1 = mean_squared_error(test['SalePrice'], p1, squared=False)
        lr.fit(test[features], test['SalePrice'])
        p2 = lr.predict(train[features])
        rmse_2 = mean_squared_error(train['SalePrice'], p2, squared=False)
        return np.mean([rmse_1, rmse_2])
    else:
        holder = list()
        kf = KFold(n_splits=k, shuffle=True)
        for train_index, test_index in kf.split(num_df):
            train = num_df.iloc[train_index]
            test = num_df.iloc[test_index]
            #print("TRAIN:", train_index, "TEST:", test_index)
            #print("TRAIN:", train, "TEST:", test)
            lr.fit(train[features], train['SalePrice'])
            testP = lr.predict(test[features])
            rmse = mean_squared_error(test['SalePrice'], testP, squared=False)
            holder.append(rmse)
        return np.mean(rmse)

Feature Engineering

clean_df = transform_features(df)
clean_df
Order PID MS SubClass MS Zoning Lot Area Street Lot Shape Land Contour Utilities Lot Config Land Slope Neighborhood Condition 1 Condition 2 Bldg Type House Style Overall Qual Overall Cond Year Built Year Remod/Add Roof Style Roof Matl Exterior 1st Exterior 2nd Mas Vnr Area Exter Qual Exter Cond Foundation BsmtFin SF 1 BsmtFin SF 2 Bsmt Unf SF Total Bsmt SF Heating Heating QC Central Air 1st Flr SF 2nd Flr SF Low Qual Fin SF Gr Liv Area Bsmt Full Bath Bsmt Half Bath Full Bath Half Bath Bedroom AbvGr Kitchen AbvGr Kitchen Qual TotRms AbvGrd Functional Fireplaces Garage Cars Garage Area Paved Drive Wood Deck SF Open Porch SF Enclosed Porch 3Ssn Porch Screen Porch Pool Area Misc Val Mo Sold Yr Sold Sale Type Sale Condition SalePrice
0 1 526301100 20 RL 31770 Pave IR1 Lvl AllPub Corner Gtl NAmes Norm Norm 1Fam 1Story 6 5 1960 1960 Hip CompShg BrkFace Plywood 112.0 TA TA CBlock 639.0 0.0 441.0 1080.0 GasA Fa Y 1656 0 0 1656 1.0 0.0 1 0 3 1 TA 7 Typ 2 2.0 528.0 P 210 62 0 0 0 0 0 5 2010 WD Normal 215000
1 2 526350040 20 RH 11622 Pave Reg Lvl AllPub Inside Gtl NAmes Feedr Norm 1Fam 1Story 5 6 1961 1961 Gable CompShg VinylSd VinylSd 0.0 TA TA CBlock 468.0 144.0 270.0 882.0 GasA TA Y 896 0 0 896 0.0 0.0 1 0 2 1 TA 5 Typ 0 1.0 730.0 Y 140 0 0 0 120 0 0 6 2010 WD Normal 105000
2 3 526351010 20 RL 14267 Pave IR1 Lvl AllPub Corner Gtl NAmes Norm Norm 1Fam 1Story 6 6 1958 1958 Hip CompShg Wd Sdng Wd Sdng 108.0 TA TA CBlock 923.0 0.0 406.0 1329.0 GasA TA Y 1329 0 0 1329 0.0 0.0 1 1 3 1 Gd 6 Typ 0 1.0 312.0 Y 393 36 0 0 0 0 12500 6 2010 WD Normal 172000
3 4 526353030 20 RL 11160 Pave Reg Lvl AllPub Corner Gtl NAmes Norm Norm 1Fam 1Story 7 5 1968 1968 Hip CompShg BrkFace BrkFace 0.0 Gd TA CBlock 1065.0 0.0 1045.0 2110.0 GasA Ex Y 2110 0 0 2110 1.0 0.0 2 1 3 1 Ex 8 Typ 2 2.0 522.0 Y 0 0 0 0 0 0 0 4 2010 WD Normal 244000
4 5 527105010 60 RL 13830 Pave IR1 Lvl AllPub Inside Gtl Gilbert Norm Norm 1Fam 2Story 5 5 1997 1998 Gable CompShg VinylSd VinylSd 0.0 TA TA PConc 791.0 0.0 137.0 928.0 GasA Gd Y 928 701 0 1629 0.0 0.0 2 1 3 1 TA 6 Typ 1 2.0 482.0 Y 212 34 0 0 0 0 0 3 2010 WD Normal 189900
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2925 2926 923275080 80 RL 7937 Pave IR1 Lvl AllPub CulDSac Gtl Mitchel Norm Norm 1Fam SLvl 6 6 1984 1984 Gable CompShg HdBoard HdBoard 0.0 TA TA CBlock 819.0 0.0 184.0 1003.0 GasA TA Y 1003 0 0 1003 1.0 0.0 1 0 3 1 TA 6 Typ 0 2.0 588.0 Y 120 0 0 0 0 0 0 3 2006 WD Normal 142500
2926 2927 923276100 20 RL 8885 Pave IR1 Low AllPub Inside Mod Mitchel Norm Norm 1Fam 1Story 5 5 1983 1983 Gable CompShg HdBoard HdBoard 0.0 TA TA CBlock 301.0 324.0 239.0 864.0 GasA TA Y 902 0 0 902 1.0 0.0 1 0 2 1 TA 5 Typ 0 2.0 484.0 Y 164 0 0 0 0 0 0 6 2006 WD Normal 131000
2927 2928 923400125 85 RL 10441 Pave Reg Lvl AllPub Inside Gtl Mitchel Norm Norm 1Fam SFoyer 5 5 1992 1992 Gable CompShg HdBoard Wd Shng 0.0 TA TA PConc 337.0 0.0 575.0 912.0 GasA TA Y 970 0 0 970 0.0 1.0 1 0 3 1 TA 6 Typ 0 0.0 0.0 Y 80 32 0 0 0 0 700 7 2006 WD Normal 132000
2928 2929 924100070 20 RL 10010 Pave Reg Lvl AllPub Inside Mod Mitchel Norm Norm 1Fam 1Story 5 5 1974 1975 Gable CompShg HdBoard HdBoard 0.0 TA TA CBlock 1071.0 123.0 195.0 1389.0 GasA Gd Y 1389 0 0 1389 1.0 0.0 1 0 2 1 TA 6 Typ 1 2.0 418.0 Y 240 38 0 0 0 0 0 4 2006 WD Normal 170000
2929 2930 924151050 60 RL 9627 Pave Reg Lvl AllPub Inside Mod Mitchel Norm Norm 1Fam 2Story 7 5 1993 1994 Gable CompShg HdBoard HdBoard 94.0 TA TA PConc 758.0 0.0 238.0 996.0 GasA Ex Y 996 1004 0 2000 0.0 0.0 2 1 3 1 TA 9 Typ 1 3.0 650.0 Y 190 48 0 0 0 0 0 11 2006 WD Normal 188000

2930 rows × 64 columns

# add new columns
clean_df['Year Since Sold'] = clean_df['Yr Sold'] - clean_df['Year Built']
clean_df['Year Since Re'] = clean_df['Yr Sold'] - clean_df['Year Remod/Add']
# keep only valid values from these columns
clean_df = clean_df[(clean_df['Year Since Sold'] > 0) &
                   (clean_df['Year Since Re'] > 0)]
# keep only relevant features
clean_df.drop(['Mo Sold', 'Yr Sold', 'Sale Type', 'Sale Condition', 'PID', 'Order'], 
              axis=1, inplace=True)
/Users/MimiHMB/anaconda3/lib/python3.7/site-packages/pandas/core/frame.py:3997: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
clean_df
MS SubClass MS Zoning Lot Area Street Lot Shape Land Contour Utilities Lot Config Land Slope Neighborhood Condition 1 Condition 2 Bldg Type House Style Overall Qual Overall Cond Year Built Year Remod/Add Roof Style Roof Matl Exterior 1st Exterior 2nd Mas Vnr Area Exter Qual Exter Cond Foundation BsmtFin SF 1 BsmtFin SF 2 Bsmt Unf SF Total Bsmt SF Heating Heating QC Central Air 1st Flr SF 2nd Flr SF Low Qual Fin SF Gr Liv Area Bsmt Full Bath Bsmt Half Bath Full Bath Half Bath Bedroom AbvGr Kitchen AbvGr Kitchen Qual TotRms AbvGrd Functional Fireplaces Garage Cars Garage Area Paved Drive Wood Deck SF Open Porch SF Enclosed Porch 3Ssn Porch Screen Porch Pool Area Misc Val SalePrice Year Since Sold Year Since Re
0 20 RL 31770 Pave IR1 Lvl AllPub Corner Gtl NAmes Norm Norm 1Fam 1Story 6 5 1960 1960 Hip CompShg BrkFace Plywood 112.0 TA TA CBlock 639.0 0.0 441.0 1080.0 GasA Fa Y 1656 0 0 1656 1.0 0.0 1 0 3 1 TA 7 Typ 2 2.0 528.0 P 210 62 0 0 0 0 0 215000 50 50
1 20 RH 11622 Pave Reg Lvl AllPub Inside Gtl NAmes Feedr Norm 1Fam 1Story 5 6 1961 1961 Gable CompShg VinylSd VinylSd 0.0 TA TA CBlock 468.0 144.0 270.0 882.0 GasA TA Y 896 0 0 896 0.0 0.0 1 0 2 1 TA 5 Typ 0 1.0 730.0 Y 140 0 0 0 120 0 0 105000 49 49
2 20 RL 14267 Pave IR1 Lvl AllPub Corner Gtl NAmes Norm Norm 1Fam 1Story 6 6 1958 1958 Hip CompShg Wd Sdng Wd Sdng 108.0 TA TA CBlock 923.0 0.0 406.0 1329.0 GasA TA Y 1329 0 0 1329 0.0 0.0 1 1 3 1 Gd 6 Typ 0 1.0 312.0 Y 393 36 0 0 0 0 12500 172000 52 52
3 20 RL 11160 Pave Reg Lvl AllPub Corner Gtl NAmes Norm Norm 1Fam 1Story 7 5 1968 1968 Hip CompShg BrkFace BrkFace 0.0 Gd TA CBlock 1065.0 0.0 1045.0 2110.0 GasA Ex Y 2110 0 0 2110 1.0 0.0 2 1 3 1 Ex 8 Typ 2 2.0 522.0 Y 0 0 0 0 0 0 0 244000 42 42
4 60 RL 13830 Pave IR1 Lvl AllPub Inside Gtl Gilbert Norm Norm 1Fam 2Story 5 5 1997 1998 Gable CompShg VinylSd VinylSd 0.0 TA TA PConc 791.0 0.0 137.0 928.0 GasA Gd Y 928 701 0 1629 0.0 0.0 2 1 3 1 TA 6 Typ 1 2.0 482.0 Y 212 34 0 0 0 0 0 189900 13 12
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2925 80 RL 7937 Pave IR1 Lvl AllPub CulDSac Gtl Mitchel Norm Norm 1Fam SLvl 6 6 1984 1984 Gable CompShg HdBoard HdBoard 0.0 TA TA CBlock 819.0 0.0 184.0 1003.0 GasA TA Y 1003 0 0 1003 1.0 0.0 1 0 3 1 TA 6 Typ 0 2.0 588.0 Y 120 0 0 0 0 0 0 142500 22 22
2926 20 RL 8885 Pave IR1 Low AllPub Inside Mod Mitchel Norm Norm 1Fam 1Story 5 5 1983 1983 Gable CompShg HdBoard HdBoard 0.0 TA TA CBlock 301.0 324.0 239.0 864.0 GasA TA Y 902 0 0 902 1.0 0.0 1 0 2 1 TA 5 Typ 0 2.0 484.0 Y 164 0 0 0 0 0 0 131000 23 23
2927 85 RL 10441 Pave Reg Lvl AllPub Inside Gtl Mitchel Norm Norm 1Fam SFoyer 5 5 1992 1992 Gable CompShg HdBoard Wd Shng 0.0 TA TA PConc 337.0 0.0 575.0 912.0 GasA TA Y 970 0 0 970 0.0 1.0 1 0 3 1 TA 6 Typ 0 0.0 0.0 Y 80 32 0 0 0 0 700 132000 14 14
2928 20 RL 10010 Pave Reg Lvl AllPub Inside Mod Mitchel Norm Norm 1Fam 1Story 5 5 1974 1975 Gable CompShg HdBoard HdBoard 0.0 TA TA CBlock 1071.0 123.0 195.0 1389.0 GasA Gd Y 1389 0 0 1389 1.0 0.0 1 0 2 1 TA 6 Typ 1 2.0 418.0 Y 240 38 0 0 0 0 0 170000 32 31
2929 60 RL 9627 Pave Reg Lvl AllPub Inside Mod Mitchel Norm Norm 1Fam 2Story 7 5 1993 1994 Gable CompShg HdBoard HdBoard 94.0 TA TA PConc 758.0 0.0 238.0 996.0 GasA Ex Y 996 1004 0 2000 0.0 0.0 2 1 3 1 TA 9 Typ 1 3.0 650.0 Y 190 48 0 0 0 0 0 188000 13 12

2689 rows × 60 columns

# recheck if there is any missing value
any(clean_df.isnull().sum() > 0)
False

Feature Selection

sel_df = select_features(clean_df, 0.4)
sel_df
SalePrice Overall Qual Gr Liv Area Total Bsmt SF 1st Flr SF Garage Cars Garage Area Full Bath Year Built Year Since Sold Year Since Re Year Remod/Add Fireplaces TotRms AbvGrd Mas Vnr Area BsmtFin SF 1
0 215000 6 1656 1080.0 1656 2.0 528.0 1 1960 50 50 1960 2 7 112.0 639.0
1 105000 5 896 882.0 896 1.0 730.0 1 1961 49 49 1961 0 5 0.0 468.0
2 172000 6 1329 1329.0 1329 1.0 312.0 1 1958 52 52 1958 0 6 108.0 923.0
3 244000 7 2110 2110.0 2110 2.0 522.0 2 1968 42 42 1968 2 8 0.0 1065.0
4 189900 5 1629 928.0 928 2.0 482.0 2 1997 13 12 1998 1 6 0.0 791.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2925 142500 6 1003 1003.0 1003 2.0 588.0 1 1984 22 22 1984 0 6 0.0 819.0
2926 131000 5 902 864.0 902 2.0 484.0 1 1983 23 23 1983 0 5 0.0 301.0
2927 132000 5 970 912.0 970 0.0 0.0 1 1992 14 14 1992 0 6 0.0 337.0
2928 170000 5 1389 1389.0 1389 2.0 418.0 1 1974 32 31 1975 1 6 0.0 1071.0
2929 188000 7 2000 996.0 996 3.0 650.0 2 1993 13 12 1994 1 9 94.0 758.0

2689 rows × 16 columns

# make a dictionary of value counts for all columns in clean_df (the df resulted from initial aggregation)
holder = dict()
for i in clean_df.columns:
    holder[i] = len(clean_df[i].value_counts())
holder
{'MS SubClass': 16,
 'MS Zoning': 7,
 'Lot Area': 1795,
 'Street': 2,
 'Lot Shape': 4,
 'Land Contour': 4,
 'Utilities': 3,
 'Lot Config': 5,
 'Land Slope': 3,
 'Neighborhood': 28,
 'Condition 1': 9,
 'Condition 2': 8,
 'Bldg Type': 5,
 'House Style': 8,
 'Overall Qual': 10,
 'Overall Cond': 9,
 'Year Built': 117,
 'Year Remod/Add': 60,
 'Roof Style': 6,
 'Roof Matl': 7,
 'Exterior 1st': 16,
 'Exterior 2nd': 16,
 'Mas Vnr Area': 413,
 'Exter Qual': 4,
 'Exter Cond': 5,
 'Foundation': 6,
 'BsmtFin SF 1': 942,
 'BsmtFin SF 2': 273,
 'Bsmt Unf SF': 1060,
 'Total Bsmt SF': 978,
 'Heating': 6,
 'Heating QC': 5,
 'Central Air': 2,
 '1st Flr SF': 1028,
 '2nd Flr SF': 597,
 'Low Qual Fin SF': 36,
 'Gr Liv Area': 1226,
 'Bsmt Full Bath': 4,
 'Bsmt Half Bath': 3,
 'Full Bath': 5,
 'Half Bath': 3,
 'Bedroom AbvGr': 8,
 'Kitchen AbvGr': 4,
 'Kitchen Qual': 5,
 'TotRms AbvGrd': 13,
 'Functional': 8,
 'Fireplaces': 5,
 'Garage Cars': 6,
 'Garage Area': 570,
 'Paved Drive': 3,
 'Wood Deck SF': 376,
 'Open Porch SF': 238,
 'Enclosed Porch': 181,
 '3Ssn Porch': 27,
 'Screen Porch': 119,
 'Pool Area': 13,
 'Misc Val': 37,
 'SalePrice': 864,
 'Year Since Sold': 126,
 'Year Since Re': 60}
# nominal columns given by the data set description
nominal_features = ["PID", "MS SubClass", "MS Zoning", "Street", "Alley", "Land Contour", "Lot Config", "Neighborhood", 
                    "Condition 1", "Condition 2", "Bldg Type", "House Style", "Roof Style", "Roof Matl", "Exterior 1st", 
                    "Exterior 2nd", "Mas Vnr Type", "Foundation", "Heating", "Central Air", "Garage Type", 
                    "Misc Feature", "Sale Type", "Sale Condition"]
# choose only nominal features that have at most 10 unique values
to_cat = list()
for i in holder.keys():
    if (i in nominal_features) & (holder[i] <= 10):
        to_cat.append(i)
to_cat
['MS Zoning',
 'Street',
 'Land Contour',
 'Lot Config',
 'Condition 1',
 'Condition 2',
 'Bldg Type',
 'House Style',
 'Roof Style',
 'Roof Matl',
 'Foundation',
 'Heating',
 'Central Air']
# categorize these nominal features and make corresponding dummies
for i in to_cat:
    sel_df[i] = clean_df[i].astype('category')
    sel_df = pd.concat([sel_df, 
                        pd.get_dummies(sel_df.select_dtypes(include=['category']))], axis=1).drop(i, axis=1)
/Users/MimiHMB/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
sel_df
SalePrice Overall Qual Gr Liv Area Total Bsmt SF 1st Flr SF Garage Cars Garage Area Full Bath Year Built Year Since Sold Year Since Re Year Remod/Add Fireplaces TotRms AbvGrd Mas Vnr Area BsmtFin SF 1 MS Zoning_A (agr) MS Zoning_C (all) MS Zoning_FV MS Zoning_I (all) MS Zoning_RH MS Zoning_RL MS Zoning_RM Street_Grvl Street_Pave Land Contour_Bnk Land Contour_HLS Land Contour_Low Land Contour_Lvl Lot Config_Corner Lot Config_CulDSac Lot Config_FR2 Lot Config_FR3 Lot Config_Inside Condition 1_Artery Condition 1_Feedr Condition 1_Norm Condition 1_PosA Condition 1_PosN Condition 1_RRAe Condition 1_RRAn Condition 1_RRNe Condition 1_RRNn Condition 2_Artery Condition 2_Feedr Condition 2_Norm Condition 2_PosA Condition 2_PosN Condition 2_RRAe Condition 2_RRAn Condition 2_RRNn Bldg Type_1Fam Bldg Type_2fmCon Bldg Type_Duplex Bldg Type_Twnhs Bldg Type_TwnhsE House Style_1.5Fin House Style_1.5Unf House Style_1Story House Style_2.5Fin House Style_2.5Unf House Style_2Story House Style_SFoyer House Style_SLvl Roof Style_Flat Roof Style_Gable Roof Style_Gambrel Roof Style_Hip Roof Style_Mansard Roof Style_Shed Roof Matl_CompShg Roof Matl_Membran Roof Matl_Metal Roof Matl_Roll Roof Matl_Tar&Grv Roof Matl_WdShake Roof Matl_WdShngl Foundation_BrkTil Foundation_CBlock Foundation_PConc Foundation_Slab Foundation_Stone Foundation_Wood Heating_Floor Heating_GasA Heating_GasW Heating_Grav Heating_OthW Heating_Wall Central Air_N Central Air_Y
0 215000 6 1656 1080.0 1656 2.0 528.0 1 1960 50 50 1960 2 7 112.0 639.0 0 0 0 0 0 1 0 0 1 0 0 0 1 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 1
1 105000 5 896 882.0 896 1.0 730.0 1 1961 49 49 1961 0 5 0.0 468.0 0 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 1
2 172000 6 1329 1329.0 1329 1.0 312.0 1 1958 52 52 1958 0 6 108.0 923.0 0 0 0 0 0 1 0 0 1 0 0 0 1 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 1
3 244000 7 2110 2110.0 2110 2.0 522.0 2 1968 42 42 1968 2 8 0.0 1065.0 0 0 0 0 0 1 0 0 1 0 0 0 1 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 1
4 189900 5 1629 928.0 928 2.0 482.0 2 1997 13 12 1998 1 6 0.0 791.0 0 0 0 0 0 1 0 0 1 0 0 0 1 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 1
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2925 142500 6 1003 1003.0 1003 2.0 588.0 1 1984 22 22 1984 0 6 0.0 819.0 0 0 0 0 0 1 0 0 1 0 0 0 1 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 1
2926 131000 5 902 864.0 902 2.0 484.0 1 1983 23 23 1983 0 5 0.0 301.0 0 0 0 0 0 1 0 0 1 0 0 1 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 1
2927 132000 5 970 912.0 970 0.0 0.0 1 1992 14 14 1992 0 6 0.0 337.0 0 0 0 0 0 1 0 0 1 0 0 0 1 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 1
2928 170000 5 1389 1389.0 1389 2.0 418.0 1 1974 32 31 1975 1 6 0.0 1071.0 0 0 0 0 0 1 0 0 1 0 0 0 1 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 1
2929 188000 7 2000 996.0 996 3.0 650.0 2 1993 13 12 1994 1 9 94.0 758.0 0 0 0 0 0 1 0 0 1 0 0 0 1 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 1

2689 rows × 91 columns

Multivariate Linear Regression & Cross Validation

x = list(range(0, 30))
y = [train_and_test(sel_df, i) for i in x]

Lowest RMSE achieved at 30 folds

plt.plot(x, y)
plt.xlabel('k fold')
plt.ylabel('RMSE')
plt.show()

png