Data Engineer & DataOps
My LinkedIn
My GitHub
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import KFold, cross_val_score
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_squared_error
# Read the placement records and discard the first column in one step,
# then display the resulting frame.
data = pd.read_csv('Placement_Data_Full_Class.csv').iloc[:, 1:]
data
gender | ssc_p | ssc_b | hsc_p | hsc_b | hsc_s | degree_p | degree_t | workex | etest_p | specialisation | mba_p | status | salary | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | M | 67.00 | Others | 91.00 | Others | Commerce | 58.00 | Sci&Tech | No | 55.0 | Mkt&HR | 58.80 | Placed | 270000.0 |
1 | M | 79.33 | Central | 78.33 | Others | Science | 77.48 | Sci&Tech | Yes | 86.5 | Mkt&Fin | 66.28 | Placed | 200000.0 |
2 | M | 65.00 | Central | 68.00 | Central | Arts | 64.00 | Comm&Mgmt | No | 75.0 | Mkt&Fin | 57.80 | Placed | 250000.0 |
3 | M | 56.00 | Central | 52.00 | Central | Science | 52.00 | Sci&Tech | No | 66.0 | Mkt&HR | 59.43 | Not Placed | NaN |
4 | M | 85.80 | Central | 73.60 | Central | Commerce | 73.30 | Comm&Mgmt | No | 96.8 | Mkt&Fin | 55.50 | Placed | 425000.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
210 | M | 80.60 | Others | 82.00 | Others | Commerce | 77.60 | Comm&Mgmt | No | 91.0 | Mkt&Fin | 74.49 | Placed | 400000.0 |
211 | M | 58.00 | Others | 60.00 | Others | Science | 72.00 | Sci&Tech | No | 74.0 | Mkt&Fin | 53.62 | Placed | 275000.0 |
212 | M | 67.00 | Others | 67.00 | Others | Commerce | 73.00 | Comm&Mgmt | Yes | 59.0 | Mkt&Fin | 69.72 | Placed | 295000.0 |
213 | F | 74.00 | Others | 66.00 | Others | Commerce | 58.00 | Comm&Mgmt | No | 70.0 | Mkt&HR | 60.23 | Placed | 204000.0 |
214 | M | 62.00 | Central | 58.00 | Others | Science | 53.00 | Comm&Mgmt | No | 89.0 | Mkt&HR | 60.22 | Not Placed | NaN |
215 rows × 14 columns
# One-hot encode every text (object-dtype) column; numeric columns pass through.
o = list(data.select_dtypes(include='object').columns)
num_data = data.drop(columns=o)
# A single get_dummies call replaces the per-column convert/concat/drop loop:
# it yields the same '<column>_<value>' indicator columns in the same order,
# and never assigns into a slice of `data` (no SettingWithCopyWarning).
# dtype is pinned so the indicators stay 0/1 integers on every pandas version.
txt_data = pd.get_dummies(data[o], dtype='uint8')
trans_data = pd.concat([txt_data, num_data], axis=1)
# 'status_Placed' alone carries the binary outcome, so its complement is dropped.
trans_data = trans_data.drop(columns='status_Not Placed')
trans_data
gender_F | gender_M | ssc_b_Central | ssc_b_Others | hsc_b_Central | hsc_b_Others | hsc_s_Arts | hsc_s_Commerce | hsc_s_Science | degree_t_Comm&Mgmt | ... | workex_Yes | specialisation_Mkt&Fin | specialisation_Mkt&HR | status_Placed | ssc_p | hsc_p | degree_p | etest_p | mba_p | salary | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 1 | 1 | 67.00 | 91.00 | 58.00 | 55.0 | 58.80 | 270000.0 |
1 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | ... | 1 | 1 | 0 | 1 | 79.33 | 78.33 | 77.48 | 86.5 | 66.28 | 200000.0 |
2 | 0 | 1 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | ... | 0 | 1 | 0 | 1 | 65.00 | 68.00 | 64.00 | 75.0 | 57.80 | 250000.0 |
3 | 0 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | ... | 0 | 0 | 1 | 0 | 56.00 | 52.00 | 52.00 | 66.0 | 59.43 | NaN |
4 | 0 | 1 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | ... | 0 | 1 | 0 | 1 | 85.80 | 73.60 | 73.30 | 96.8 | 55.50 | 425000.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
210 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 1 | ... | 0 | 1 | 0 | 1 | 80.60 | 82.00 | 77.60 | 91.0 | 74.49 | 400000.0 |
211 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | ... | 0 | 1 | 0 | 1 | 58.00 | 60.00 | 72.00 | 74.0 | 53.62 | 275000.0 |
212 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 1 | ... | 1 | 1 | 0 | 1 | 67.00 | 67.00 | 73.00 | 59.0 | 69.72 | 295000.0 |
213 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 1 | ... | 0 | 0 | 1 | 1 | 74.00 | 66.00 | 58.00 | 70.0 | 60.23 | 204000.0 |
214 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | ... | 0 | 0 | 1 | 0 | 62.00 | 58.00 | 53.00 | 89.0 | 60.22 | NaN |
215 rows × 23 columns
# Reorder columns by position: the five columns that sit just before the last
# one come first, followed by everything ahead of the final six. The last
# column is left out of the result.
positions = list(range(trans_data.shape[1]))
reordered = positions[-6:-1] + positions[:-6]
task_1_data = trans_data.iloc[:, reordered]
task_1_data
ssc_p | hsc_p | degree_p | etest_p | mba_p | gender_F | gender_M | ssc_b_Central | ssc_b_Others | hsc_b_Central | ... | hsc_s_Commerce | hsc_s_Science | degree_t_Comm&Mgmt | degree_t_Others | degree_t_Sci&Tech | workex_No | workex_Yes | specialisation_Mkt&Fin | specialisation_Mkt&HR | status_Placed | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 67.00 | 91.00 | 58.00 | 55.0 | 58.80 | 0 | 1 | 0 | 1 | 0 | ... | 1 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 1 |
1 | 79.33 | 78.33 | 77.48 | 86.5 | 66.28 | 0 | 1 | 1 | 0 | 0 | ... | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | 1 |
2 | 65.00 | 68.00 | 64.00 | 75.0 | 57.80 | 0 | 1 | 1 | 0 | 1 | ... | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 1 |
3 | 56.00 | 52.00 | 52.00 | 66.0 | 59.43 | 0 | 1 | 1 | 0 | 1 | ... | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 |
4 | 85.80 | 73.60 | 73.30 | 96.8 | 55.50 | 0 | 1 | 1 | 0 | 1 | ... | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 1 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
210 | 80.60 | 82.00 | 77.60 | 91.0 | 74.49 | 0 | 1 | 0 | 1 | 0 | ... | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 1 |
211 | 58.00 | 60.00 | 72.00 | 74.0 | 53.62 | 0 | 1 | 0 | 1 | 0 | ... | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 1 | 0 | 1 |
212 | 67.00 | 67.00 | 73.00 | 59.0 | 69.72 | 0 | 1 | 0 | 1 | 0 | ... | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 1 | 0 | 1 |
213 | 74.00 | 66.00 | 58.00 | 70.0 | 60.23 | 1 | 0 | 0 | 1 | 0 | ... | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
214 | 62.00 | 58.00 | 53.00 | 89.0 | 60.22 | 0 | 1 | 1 | 0 | 0 | ... | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 |
215 rows × 22 columns
# confirm whether the target classes are imbalanced (relative frequencies)
placed = task_1_data.loc[:, 'status_Placed']
placed.value_counts(normalize=True)
1 0.688372
0 0.311628
Name: status_Placed, dtype: float64
# Hold out a test split: every column but the last is a feature, the last is the label.
task_1_X, task_1_y = task_1_data.iloc[:, :-1], task_1_data.iloc[:, -1]
X_train, X_test, y_train, y_test = train_test_split(task_1_X, task_1_y, random_state=42)

# Scale, then classify; the grid below is searched over this pipeline.
pipe = Pipeline(steps=[('preprocessing', StandardScaler()), ('classifier', SVC())])

# The same candidate values are tried for both C and gamma.
search_values = [0.001, 0.01, 0.1, 1, 10, 100]
param_grid = [{
    'classifier': [SVC()],
    'preprocessing': [StandardScaler()],
    'classifier__gamma': search_values,
    'classifier__C': search_values,
    'classifier__probability': [True],
    'classifier__class_weight': ['balanced'],
}]

# 10-fold cross-validated search on the training split only.
grid = GridSearchCV(pipe, param_grid, cv=10)
grid.fit(X_train, y_train)
print(grid.best_params_)
{'classifier': SVC(C=10, break_ties=False, cache_size=200, class_weight='balanced', coef0=0.0,
decision_function_shape='ovr', degree=3, gamma=0.01, kernel='rbf',
max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
verbose=False), 'classifier__C': 10, 'classifier__class_weight': 'balanced', 'classifier__gamma': 0.01, 'classifier__probability': True, 'preprocessing': StandardScaler(copy=True, with_mean=True, with_std=True)}
# Reuse the pipeline that the grid search already refit on X_train with the
# best parameters (refit=True is the GridSearchCV default). Rebuilding it from
# grid.best_params_['classifier'] only works when the search has mutated the
# SVC instance listed in param_grid, which depends on the scikit-learn
# version; otherwise that instance is a default SVC without probability=True.
pipe = grid.best_estimator_
# Accuracy on the training split, then on the held-out split.
print(pipe.score(X_train, y_train))
print(pipe.score(X_test, y_test))
0.9192546583850931
0.8703703703703703
# F1 score per cross-validation fold over the complete task-1 table
# (features = every column but the last, label = last column).
all_features = task_1_data.iloc[:, :-1]
all_labels = task_1_data.iloc[:, -1]
cross_val_score(pipe, all_features, all_labels, scoring="f1")
array([0.8852459 , 0.87096774, 0.92857143, 0.83333333, 0.84615385])
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, roc_auc_score, precision_recall_curve
# Predicted probability of the second class (column 1) for each held-out sample.
y_probabilities = pipe.predict_proba(X_test)[:, 1]
false_positive_rate, true_positive_rate, threshold = roc_curve(y_test, y_probabilities)
# ROC curve
plt.title("Receiver Operating Characteristic")
plt.plot(false_positive_rate, true_positive_rate, label="SVC")
# Chance-level reference: the diagonal from (0, 0) to (1, 1).
# (Plotting the bare list [1, 1] draws a flat line at y = 1 instead.)
plt.plot([0, 1], [0, 1], ls='--', c='black', label="Chance")
plt.ylabel("True Positive Rate")
plt.xlabel("False Positive Rate")
# Labels are attached to the lines above, so the legend is no longer empty.
plt.legend()
plt.show()
# area under the ROC curve
roc_auc_score(y_test, y_probabilities)
0.9446428571428571
# support vectors
# `support_` holds the positions of the training *samples* chosen as support
# vectors, so it must index the rows the classifier was fit on -- not the
# column names. Slicing also copes with fewer than five support vectors.
support_rows = pipe[1].support_[:5]
print(X_train.iloc[support_rows])
hsc_p
mba_p
gender_M
ssc_b_Central
ssc_b_Others
# Load the admission table, indexed by its 'Serial No.' column.
task_2_data = pd.read_csv('Admission_Predict_Ver1.1.csv', index_col='Serial No.')
# Features are every column but the last; the target is the last column as an array.
X = task_2_data.iloc[:, :-1]
y = task_2_data.iloc[:, -1].values
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
# Correlation of each column with the target, strongest first.
target_corr = task_2_data.corr()['Chance of Admit ']
corr_cols = target_corr.sort_values(ascending=False)
# Skip the first entry of the sorted series and show the rest.
corr_cols.iloc[1:]
CGPA 0.882413
GRE Score 0.810351
TOEFL Score 0.792228
University Rating 0.690132
SOP 0.684137
LOR 0.645365
Research 0.545871
Name: Chance of Admit , dtype: float64
from sklearn.svm import SVR
# Scale, then regress; step names match what make_pipeline would generate.
pipe = Pipeline([('standardscaler', StandardScaler()), ('svr', SVR())])

# Two search spaces: RBF (C x gamma) and linear (C only).
c_values = [0.001, 0.01, 0.1, 1, 10, 100]
gamma_values = [0.001, 0.01, 0.1, 1, 10, 100]
rbf_space = {'svr__kernel': ['rbf'], 'svr__C': c_values, 'svr__gamma': gamma_values}
linear_space = {'svr__kernel': ['linear'], 'svr__C': c_values}
param_grid = [rbf_space, linear_space]

# 5-fold cross-validated search on the training split.
grid = GridSearchCV(pipe, param_grid, cv=5)
grid.fit(X_train, y_train)
print(grid.best_params_)
{'svr__C': 100, 'svr__kernel': 'linear'}
# Take the winning pipeline straight from the search instead of hard-coding
# the parameters copied from its printed output: best_estimator_ is already
# refit on X_train (refit=True is the GridSearchCV default) and stays in sync
# if the data or the grid change.
pipe = grid.best_estimator_
# Mean squared error on the held-out split.
print(mean_squared_error(y_test, pipe.predict(X_test)))
0.004247343408697258