My Bui (Mimi)

Data Engineer & DataOps

My LinkedIn
My GitHub

Support Vector Machine: classification of graduation and regression of admission data

Goals

1. Classification - job placement status after graduation: test accuracy = 0.8703

2. Regression - chance of admission: MSE = 0.0042

import pandas as pd
from sklearn import metrics
from sklearn.model_selection import KFold, cross_val_score 
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_squared_error
# Load the placement data set and keep every column except the first one
# (selected by position, not by name).
data = pd.read_csv('Placement_Data_Full_Class.csv').iloc[:, 1:]
data
gender ssc_p ssc_b hsc_p hsc_b hsc_s degree_p degree_t workex etest_p specialisation mba_p status salary
0 M 67.00 Others 91.00 Others Commerce 58.00 Sci&Tech No 55.0 Mkt&HR 58.80 Placed 270000.0
1 M 79.33 Central 78.33 Others Science 77.48 Sci&Tech Yes 86.5 Mkt&Fin 66.28 Placed 200000.0
2 M 65.00 Central 68.00 Central Arts 64.00 Comm&Mgmt No 75.0 Mkt&Fin 57.80 Placed 250000.0
3 M 56.00 Central 52.00 Central Science 52.00 Sci&Tech No 66.0 Mkt&HR 59.43 Not Placed NaN
4 M 85.80 Central 73.60 Central Commerce 73.30 Comm&Mgmt No 96.8 Mkt&Fin 55.50 Placed 425000.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
210 M 80.60 Others 82.00 Others Commerce 77.60 Comm&Mgmt No 91.0 Mkt&Fin 74.49 Placed 400000.0
211 M 58.00 Others 60.00 Others Science 72.00 Sci&Tech No 74.0 Mkt&Fin 53.62 Placed 275000.0
212 M 67.00 Others 67.00 Others Commerce 73.00 Comm&Mgmt Yes 59.0 Mkt&Fin 69.72 Placed 295000.0
213 F 74.00 Others 66.00 Others Commerce 58.00 Comm&Mgmt No 70.0 Mkt&HR 60.23 Placed 204000.0
214 M 62.00 Central 58.00 Others Science 53.00 Comm&Mgmt No 89.0 Mkt&HR 60.22 Not Placed NaN

215 rows × 14 columns

Data pre-processing: transform categorical data into numerical data

# One-hot encode every text (object-dtype) column and keep the numeric
# columns unchanged.
#
# A single get_dummies call over all text columns yields the same
# "<column>_<value>" indicator columns, in the same order, as encoding them
# one at a time. It also avoids assigning into a slice of `data`.
o = list(data.select_dtypes(include='object').columns)
num_data = data.drop(columns=o)
txt_data = pd.get_dummies(data[o])

# Indicator columns first, numeric columns last.
trans_data = pd.concat([txt_data, num_data], axis=1)

# 'status_Placed' alone already encodes the binary placement outcome, so the
# complementary indicator column is dropped.
trans_data = trans_data.drop(columns='status_Not Placed')
trans_data
gender_F gender_M ssc_b_Central ssc_b_Others hsc_b_Central hsc_b_Others hsc_s_Arts hsc_s_Commerce hsc_s_Science degree_t_Comm&Mgmt ... workex_Yes specialisation_Mkt&Fin specialisation_Mkt&HR status_Placed ssc_p hsc_p degree_p etest_p mba_p salary
0 0 1 0 1 0 1 0 1 0 0 ... 0 0 1 1 67.00 91.00 58.00 55.0 58.80 270000.0
1 0 1 1 0 0 1 0 0 1 0 ... 1 1 0 1 79.33 78.33 77.48 86.5 66.28 200000.0
2 0 1 1 0 1 0 1 0 0 1 ... 0 1 0 1 65.00 68.00 64.00 75.0 57.80 250000.0
3 0 1 1 0 1 0 0 0 1 0 ... 0 0 1 0 56.00 52.00 52.00 66.0 59.43 NaN
4 0 1 1 0 1 0 0 1 0 1 ... 0 1 0 1 85.80 73.60 73.30 96.8 55.50 425000.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
210 0 1 0 1 0 1 0 1 0 1 ... 0 1 0 1 80.60 82.00 77.60 91.0 74.49 400000.0
211 0 1 0 1 0 1 0 0 1 0 ... 0 1 0 1 58.00 60.00 72.00 74.0 53.62 275000.0
212 0 1 0 1 0 1 0 1 0 1 ... 1 1 0 1 67.00 67.00 73.00 59.0 69.72 295000.0
213 1 0 0 1 0 1 0 1 0 1 ... 0 0 1 1 74.00 66.00 58.00 70.0 60.23 204000.0
214 0 1 1 0 0 1 0 0 1 1 ... 0 0 1 0 62.00 58.00 53.00 89.0 60.22 NaN

215 rows × 23 columns

1. Classification: job placement status after graduation

# Build the task-1 frame by position:
#   * the five columns just before the last column of trans_data go first
#     (the last column itself is left out),
#   * then every column preceding those, so 'status_Placed' ends up last.
score_columns = trans_data.iloc[:, -6:-1]
indicator_columns = trans_data.iloc[:, :-6]
task_1_data = pd.concat([score_columns, indicator_columns], axis=1)
task_1_data
ssc_p hsc_p degree_p etest_p mba_p gender_F gender_M ssc_b_Central ssc_b_Others hsc_b_Central ... hsc_s_Commerce hsc_s_Science degree_t_Comm&Mgmt degree_t_Others degree_t_Sci&Tech workex_No workex_Yes specialisation_Mkt&Fin specialisation_Mkt&HR status_Placed
0 67.00 91.00 58.00 55.0 58.80 0 1 0 1 0 ... 1 0 0 0 1 1 0 0 1 1
1 79.33 78.33 77.48 86.5 66.28 0 1 1 0 0 ... 0 1 0 0 1 0 1 1 0 1
2 65.00 68.00 64.00 75.0 57.80 0 1 1 0 1 ... 0 0 1 0 0 1 0 1 0 1
3 56.00 52.00 52.00 66.0 59.43 0 1 1 0 1 ... 0 1 0 0 1 1 0 0 1 0
4 85.80 73.60 73.30 96.8 55.50 0 1 1 0 1 ... 1 0 1 0 0 1 0 1 0 1
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
210 80.60 82.00 77.60 91.0 74.49 0 1 0 1 0 ... 1 0 1 0 0 1 0 1 0 1
211 58.00 60.00 72.00 74.0 53.62 0 1 0 1 0 ... 0 1 0 0 1 1 0 1 0 1
212 67.00 67.00 73.00 59.0 69.72 0 1 0 1 0 ... 1 0 1 0 0 0 1 1 0 1
213 74.00 66.00 58.00 70.0 60.23 1 0 0 1 0 ... 1 0 1 0 0 1 0 0 1 1
214 62.00 58.00 53.00 89.0 60.22 0 1 1 0 0 ... 0 1 1 0 0 1 0 0 1 0

215 rows × 22 columns

Model selection for imbalanced data

# Confirm whether the target classes are imbalanced: show the share of each
# 'status_Placed' value as a fraction of all rows.
task_1_data['status_Placed'].value_counts(normalize=True)
1    0.688372
0    0.311628
Name: status_Placed, dtype: float64
# Split task 1 into train/test sets: every column but the last is a feature,
# the last column ('status_Placed') is the label. The seed is fixed so the
# split is reproducible.
task_1_features = task_1_data.iloc[:, :-1]
task_1_labels = task_1_data.iloc[:, -1]
X_train, X_test, y_train, y_test = train_test_split(
    task_1_features, task_1_labels, random_state=42)

# Scale the features, then classify with a support vector classifier.
pipe = Pipeline(steps=[
    ('preprocessing', StandardScaler()),
    ('classifier', SVC()),
])

# Candidate values shared by the gamma and C searches.
search_values = [0.001, 0.01, 0.1, 1, 10, 100]
param_grid = [{
    'classifier': [SVC()],
    'preprocessing': [StandardScaler()],
    'classifier__gamma': search_values,
    'classifier__C': search_values,
    # predict_proba is needed later for the ROC curve.
    'classifier__probability': [True],
    # Re-weight the classes because the labels are imbalanced.
    'classifier__class_weight': ['balanced'],
}]

# Exhaustive search with 10-fold cross-validation on the training split only.
grid = GridSearchCV(pipe, param_grid, cv=10)
grid.fit(X_train, y_train)
print(grid.best_params_)
{'classifier': SVC(C=10, break_ties=False, cache_size=200, class_weight='balanced', coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.01, kernel='rbf',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False), 'classifier__C': 10, 'classifier__class_weight': 'balanced', 'classifier__gamma': 0.01, 'classifier__probability': True, 'preprocessing': StandardScaler(copy=True, with_mean=True, with_std=True)}

Fit the selected model and check its scores

# Reuse the pipeline that GridSearchCV has already refit on X_train with the
# winning hyper-parameters (refit=True is the default).
#
# The estimator object stored under grid.best_params_['classifier'] is not a
# reliable source for the tuned model: depending on the scikit-learn version
# it may be an unconfigured SVC() (default C/gamma, probability=False), which
# would silently change the scores and break predict_proba further down.
pipe = grid.best_estimator_

# Mean accuracy on the training split, then on the held-out test split.
print(pipe.score(X_train, y_train))
print(pipe.score(X_test, y_test))
0.9192546583850931
0.8703703703703703
# Cross-validated F1 score of the pipeline over the complete task-1 data set
# (features = all columns but the last, labels = last column).
all_features = task_1_data.iloc[:, :-1]
all_labels = task_1_data.iloc[:, -1]
cross_val_score(pipe, all_features, all_labels, scoring="f1")
array([0.8852459 , 0.87096774, 0.92857143, 0.83333333, 0.84615385])
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, roc_auc_score, precision_recall_curve

# Predicted probability of the second class reported by predict_proba
# (column index 1) for every test sample.
y_probabilities = pipe.predict_proba(X_test)[:,1]
false_positive_rate, true_positive_rate, threshold = roc_curve(y_test, y_probabilities)

# ROC curve
plt.title("Receiver Operating Characteristic")
plt.plot(false_positive_rate, true_positive_rate, label="SVC")
# Chance-level reference: the diagonal from (0, 0) to (1, 1). Passing only
# [1, 1] would draw a horizontal line at y = 1 instead.
plt.plot([0, 1], [0, 1], ls='--', c='black', label="Chance")
plt.ylabel("True Positive Rate")
plt.xlabel("False Positive Rate")
# Show the labels given above rather than an empty legend.
plt.legend()
plt.show()

png

# Area under the ROC curve for the test split, computed from the predicted
# probabilities in y_probabilities.
roc_auc_score(y_test, y_probabilities)
0.9446428571428571
# support vectors
# SVC.support_ holds the positions of the support vectors among the rows the
# classifier was fitted on. They are sample positions, not feature positions,
# so they must be looked up in the training rows, not in the column names.
# Slicing with [:5] also copes with a model that has fewer than five support
# vectors.
for row_label in X_train.index[pipe[1].support_[:5]]:
    print(row_label)
hsc_p
mba_p
gender_M
ssc_b_Central
ssc_b_Others

2. Regression: chance of admission

# Load the admission data set, using 'Serial No.' as the row index.
task_2_data = pd.read_csv('Admission_Predict_Ver1.1.csv', index_col='Serial No.')

# Features: every column but the last. Target: the last column as an array.
X = task_2_data.iloc[:, :-1]
y = task_2_data.iloc[:, -1].values
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Correlation of every column with the target column, strongest first.
target_correlations = task_2_data.corr()['Chance of Admit ']
corr_cols = target_correlations.sort_values(ascending=False)
# high correlation between columns
# Skip the first entry, which is expected to be the target's correlation
# with itself.
corr_cols.iloc[1:]
CGPA                 0.882413
GRE Score            0.810351
TOEFL Score          0.792228
University Rating    0.690132
SOP                  0.684137
LOR                  0.645365
Research             0.545871
Name: Chance of Admit , dtype: float64

Model selection

from sklearn.svm import SVR

# Scale the features, then fit a support vector regressor. make_pipeline
# names the steps after their classes, hence the 'svr__' parameter prefix.
pipe = make_pipeline(StandardScaler(), SVR())

# Two sub-grids: the RBF kernel is tuned over C and gamma, the linear kernel
# over C only.
c_candidates = [0.001, 0.01, 0.1, 1, 10, 100]
gamma_candidates = [0.001, 0.01, 0.1, 1, 10, 100]
rbf_grid = {
    'svr__kernel': ['rbf'],
    'svr__C': c_candidates,
    'svr__gamma': gamma_candidates,
}
linear_grid = {
    'svr__kernel': ['linear'],
    'svr__C': c_candidates,
}
param_grid = [rbf_grid, linear_grid]

# Exhaustive search with 5-fold cross-validation on the training split.
grid = GridSearchCV(pipe, param_grid, cv=5)
grid.fit(X_train, y_train)
print(grid.best_params_)
{'svr__C': 100, 'svr__kernel': 'linear'}

Fit the selected model and check the MSE

# Take the pipeline that GridSearchCV has already refit on X_train with the
# best hyper-parameters (refit=True is the default), instead of re-typing
# those values by hand. The evaluated model then always matches the search
# result, even if the grid or the data changes.
pipe = grid.best_estimator_

# Mean squared error on the held-out test split.
print(mean_squared_error(y_test, pipe.predict(X_test)))
0.004247343408697258