# Installing the libraries with the specified version.
!pip install numpy==1.25.2 pandas==1.5.3 scikit-learn==1.5.2 matplotlib==3.7.1 seaborn==0.13.1 xgboost==2.0.3 -q --user

# Install XgBoost
!pip install xgboost

Requirement already satisfied: xgboost in /usr/local/lib/python3.12/dist-packages (3.0.4)
Requirement already satisfied: numpy in /usr/local/lib/python3.12/dist-packages (from xgboost) (2.0.2)
Requirement already satisfied: nvidia-nccl-cu12 in /usr/local/lib/python3.12/dist-packages (from xgboost) (2.27.3)
Requirement already satisfied: scipy in /usr/local/lib/python3.12/dist-packages (from xgboost) (1.16.1)

import warnings

warnings.filterwarnings("ignore")

# Libraries to help with reading and manipulating data
import numpy as np
import pandas as pd

# Library to split data
from sklearn.model_selection import train_test_split

# To oversample and undersample data
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score


# libaries to help with data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Removes the limit for the number of displayed columns
pd.set_option("display.max_columns", None)
# Sets the limit for the number of displayed rows
pd.set_option("display.max_rows", 100)


# Libraries different ensemble classifiers
from sklearn.ensemble import (
    BaggingClassifier,
    RandomForestClassifier,
    AdaBoostClassifier,
    GradientBoostingClassifier,
    StackingClassifier,
)

from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier

# Libraries to get different metric scores
from sklearn import metrics
from sklearn.metrics import (
    confusion_matrix,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
)

# To tune different models
from sklearn.model_selection import (
    RandomizedSearchCV,
    GridSearchCV
)

# Run the following lines for Google Colab
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive

# Read the data
visa = pd.read_csv('/content/drive/MyDrive/Advanced Machine Learning/EasyVisa project/EasyVisa.csv')

# Copying data to another variable to avoid any changes to original data
data = visa.copy()

# The top 5 rows of the data
data.head()

# The last 5 rows of the data
data.tail()

data.shape

(25480, 12)

# print the shape of the dataset (rows and cols)
print(f" The data has {data.shape[0]} rows  \n And has {data.shape[1]} columns")

 The data has 25480 rows  
 And has 12 columns

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25480 entries, 0 to 25479
Data columns (total 12 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   case_id                25480 non-null  object 
 1   continent              25480 non-null  object 
 2   education_of_employee  25480 non-null  object 
 3   has_job_experience     25480 non-null  object 
 4   requires_job_training  25480 non-null  object 
 5   no_of_employees        25480 non-null  int64  
 6   yr_of_estab            25480 non-null  int64  
 7   region_of_employment   25480 non-null  object 
 8   prevailing_wage        25480 non-null  float64
 9   unit_of_wage           25480 non-null  object 
 10  full_time_position     25480 non-null  object 
 11  case_status            25480 non-null  object 
dtypes: float64(1), int64(2), object(9)
memory usage: 2.3+ MB

# Checking for duplicate values
data.duplicated().sum()

np.int64(0)

data.describe(include='all').T

# The negative values rows in the employee column
data.loc[data['no_of_employees'] < 0]

# Checking the shape (number of rows and columns) of all negative values (< 0)
data.loc[data['no_of_employees'] < 0].shape

(33, 12)

# Fix negatives by taking absolute value
data["no_of_employees"] = data["no_of_employees"].abs()

# Checking the shape (number of rows and columns) of all negative values (< 0) again
data.loc[data['no_of_employees'] < 0].shape

(0, 12)

# Checking the statistical summary for no_of_employees, yr_of_estab, and prevailling_wage again
data.describe().T

# Making a list of all catrgorical variables
cat_col = list(data.select_dtypes("object").columns)

# Printing number of count of each unique value in each column
for column in cat_col:
    print(data[column].value_counts())
    print(f" Total {data[column].value_counts().sum()}")
    print("-" * 50)

case_id
EZYV25480    1
EZYV01       1
EZYV02       1
EZYV03       1
EZYV04       1
            ..
EZYV13       1
EZYV12       1
EZYV11       1
EZYV10       1
EZYV09       1
Name: count, Length: 25480, dtype: int64
 Total 25480
--------------------------------------------------
continent
Asia             16861
Europe            3732
North America     3292
South America      852
Africa             551
Oceania            192
Name: count, dtype: int64
 Total 25480
--------------------------------------------------
education_of_employee
Bachelor's     10234
Master's        9634
High School     3420
Doctorate       2192
Name: count, dtype: int64
 Total 25480
--------------------------------------------------
has_job_experience
Y    14802
N    10678
Name: count, dtype: int64
 Total 25480
--------------------------------------------------
requires_job_training
N    22525
Y     2955
Name: count, dtype: int64
 Total 25480
--------------------------------------------------
region_of_employment
Northeast    7195
South        7017
West         6586
Midwest      4307
Island        375
Name: count, dtype: int64
 Total 25480
--------------------------------------------------
unit_of_wage
Year     22962
Hour      2157
Week       272
Month       89
Name: count, dtype: int64
 Total 25480
--------------------------------------------------
full_time_position
Y    22773
N     2707
Name: count, dtype: int64
 Total 25480
--------------------------------------------------
case_status
Certified    17018
Denied        8462
Name: count, dtype: int64
 Total 25480
--------------------------------------------------

## Drop 'case_id' column from the data
data.drop(["case_id"], axis=1, inplace=True)

def histogram_boxplot(data, feature, figsize=(15, 10), kde=False, bins=None):
    """
    Boxplot and histogram combined

    data: dataframe
    feature: dataframe column
    figsize: size of figure (default (15,10))
    kde: whether to show the density curve (default False)
    bins: number of bins for histogram (default None)
    """
    f2, (ax_box2, ax_hist2) = plt.subplots(
        nrows=2,  # Number of rows of the subplot grid= 2
        sharex=True,  # x-axis will be shared among all subplots
        gridspec_kw={"height_ratios": (0.25, 0.75)},
        figsize=figsize,
    )  # creating the 2 subplots
    sns.boxplot(
        data=data, x=feature, ax=ax_box2, showmeans=True, color="violet"
    )  # boxplot will be created and a triangle will indicate the mean value of the column
    sns.histplot(
        data=data, x=feature, kde=kde, ax=ax_hist2, bins=bins
    ) if bins else sns.histplot(
        data=data, x=feature, kde=kde, ax=ax_hist2
    )  # For histogram
    ax_hist2.axvline(
        data[feature].mean(), color="green", linestyle="--"
    )  # Add mean to the histogram
    ax_hist2.axvline(
        data[feature].median(), color="black", linestyle="-"
    )  # Add median to the histogram

# function to create labeled barplots


def labeled_barplot(data, feature, perc=False, n=None):
    """
    Barplot with percentage at the top

    data: dataframe
    feature: dataframe column
    perc: whether to display percentages instead of count (default is False)
    n: displays the top n category levels (default is None, i.e., display all levels)
    """

    total = len(data[feature])  # length of the column
    count = data[feature].nunique()
    if n is None:
        plt.figure(figsize=(count + 1, 5))
    else:
        plt.figure(figsize=(n + 1, 5))

    plt.xticks(rotation=90, fontsize=15)
    ax = sns.countplot(
        data=data,
        x=feature,
        palette="Paired",
        order=data[feature].value_counts().index[:n].sort_values(),
    )

    for p in ax.patches:
        if perc == True:
            label = "{:.1f}%".format(
                100 * p.get_height() / total
            )  # percentage of each class of the category
        else:
            label = p.get_height()  # count of each level of the category

        x = p.get_x() + p.get_width() / 2  # width of the plot
        y = p.get_height()  # height of the plot

        ax.annotate(
            label,
            (x, y),
            ha="center",
            va="center",
            size=12,
            xytext=(0, 5),
            textcoords="offset points",
        )  # annotate the percentage

    plt.show()  # show the plot

labeled_barplot(data, "education_of_employee", perc=True)

labeled_barplot(data, 'region_of_employment', perc = True)

labeled_barplot(data, 'has_job_experience', perc = True)

labeled_barplot(data, 'case_status', perc = True)

# The correlation between the variables
cols_list = data.select_dtypes(include=np.number).columns.tolist()

plt.figure(figsize=(10, 5))
sns.heatmap(
    data[cols_list].corr(), annot=True, vmin=-1, vmax=1, fmt=".2f", cmap="Spectral"
)
plt.show()

### function to plot distributions wrt target


def distribution_plot_wrt_target(data, predictor, target):

    fig, axs = plt.subplots(2, 2, figsize=(12, 10))

    target_uniq = data[target].unique()

    axs[0, 0].set_title("Distribution of target for target=" + str(target_uniq[0]))
    sns.histplot(
        data=data[data[target] == target_uniq[0]],
        x=predictor,
        kde=True,
        ax=axs[0, 0],
        color="teal",
        stat="density",
    )

    axs[0, 1].set_title("Distribution of target for target=" + str(target_uniq[1]))
    sns.histplot(
        data=data[data[target] == target_uniq[1]],
        x=predictor,
        kde=True,
        ax=axs[0, 1],
        color="orange",
        stat="density",
    )

    axs[1, 0].set_title("Boxplot w.r.t target")
    sns.boxplot(data=data, x=target, y=predictor, ax=axs[1, 0], palette="gist_rainbow")

    axs[1, 1].set_title("Boxplot (without outliers) w.r.t target")
    sns.boxplot(
        data=data,
        x=target,
        y=predictor,
        ax=axs[1, 1],
        showfliers=False,
        palette="gist_rainbow",
    )

    plt.tight_layout()
    plt.show()

def stacked_barplot(data, predictor, target):
    """
    Print the category counts and plot a stacked bar chart

    data: dataframe
    predictor: independent variable
    target: target variable
    """
    count = data[predictor].nunique()
    sorter = data[target].value_counts().index[-1]
    tab1 = pd.crosstab(data[predictor], data[target], margins=True).sort_values(
        by=sorter, ascending=False
    )
    print(tab1)
    print("-" * 120)
    tab = pd.crosstab(data[predictor], data[target], normalize="index").sort_values(
        by=sorter, ascending=False
    )
    tab.plot(kind="bar", stacked=True, figsize=(count + 5, 5))
    plt.legend(
        loc="lower left", frameon=False,
    )
    plt.legend(loc="upper left", bbox_to_anchor=(1, 1))
    plt.show()

stacked_barplot(data, "education_of_employee", "case_status")

case_status            Certified  Denied    All
education_of_employee                          
All                        17018    8462  25480
Bachelor's                  6367    3867  10234
High School                 1164    2256   3420
Master's                    7575    2059   9634
Doctorate                   1912     280   2192
------------------------------------------------------------------------------------------------------------------------

stacked_barplot(data, "continent", "case_status")

case_status    Certified  Denied    All
continent                              
All                17018    8462  25480
Asia               11012    5849  16861
North America       2037    1255   3292
Europe              2957     775   3732
South America        493     359    852
Africa               397     154    551
Oceania              122      70    192
------------------------------------------------------------------------------------------------------------------------

stacked_barplot(data, 'has_job_experience', 'case_status')

case_status         Certified  Denied    All
has_job_experience                          
All                     17018    8462  25480
N                        5994    4684  10678
Y                       11024    3778  14802
------------------------------------------------------------------------------------------------------------------------

plt.figure(figsize=(10, 5))
sns.boxplot(x='region_of_employment', y='prevailing_wage', data=data)
plt.show()

distribution_plot_wrt_target(data, 'prevailing_wage', 'case_status')

stacked_barplot(data, 'unit_of_wage', 'case_status')

case_status   Certified  Denied    All
unit_of_wage                          
All               17018    8462  25480
Year              16047    6915  22962
Hour                747    1410   2157
Week                169     103    272
Month                55      34     89
------------------------------------------------------------------------------------------------------------------------

# outlier detection using boxplot
numeric_columns = data.select_dtypes(include=np.number).columns.tolist()


plt.figure(figsize=(15, 12))

for i, variable in enumerate(numeric_columns):
    plt.subplot(4, 4, i + 1)
    plt.boxplot(data[variable], whis=1.5)
    plt.tight_layout()
    plt.title(variable)

plt.show()

data["case_status"] = data["case_status"].apply(lambda x: 1 if x == "Certified" else 0)
data.head()

## Let's drop case status from the data
X = data.drop(["case_status"], axis=1)
y = data["case_status"]

## Let's create dummies
X = pd.get_dummies(X, drop_first=True)

## Let's split the data into train and valid in the ratio 70:30
# Splitting data in train and test sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=1, stratify=y)

## Let's split the data into train and test in the ratio 90:10
# Splitting data in valid and test sets
X_val,X_test,y_val,y_test = train_test_split(X_val,y_val,test_size=0.1,random_state=1,stratify=y_val)

print("Shape of Training set : ", X_train.shape)
print("Shape of Validation set : ", X_val.shape)
print("Shape of test set : ", X_test.shape)
print("Percentage of classes in training set:")
print(y_train.value_counts(normalize=True))
print("Percentage of classes in validation set:")
print(y_val.value_counts(normalize=True))
print("Percentage of classes in test set:")
print(y_test.value_counts(normalize=True))

Shape of Training set :  (17836, 21)
Shape of Validation set :  (6879, 21)
Shape of test set :  (765, 21)
Percentage of classes in training set:
case_status
1    0.667919
0    0.332081
Name: proportion, dtype: float64
Percentage of classes in validation set:
case_status
1    0.66783
0    0.33217
Name: proportion, dtype: float64
Percentage of classes in test set:
case_status
1    0.667974
0    0.332026
Name: proportion, dtype: float64

# defining a function to compute different metrics to check performance of a classification model built using sklearn


def model_performance_classification_sklearn(model, predictors, target):
    """
    Function to compute different metrics to check classification model performance

    model: classifier
    predictors: independent variables
    target: dependent variable
    """

    # predicting using the independent variables
    pred = model.predict(predictors)

    acc = accuracy_score(target, pred)  # to compute Accuracy
    recall = recall_score(target, pred)  # to compute Recall
    precision = precision_score(target, pred)  # to compute Precision
    f1 = f1_score(target, pred)  # to compute F1-score

    # creating a dataframe of metrics
    df_perf = pd.DataFrame(
        {"Accuracy": acc, "Recall": recall, "Precision": precision, "F1": f1,},
        index=[0],
    )

    return df_perf

def confusion_matrix_sklearn(model, predictors, target):
    """
    To plot the confusion_matrix with percentages

    model: classifier
    predictors: independent variables
    target: dependent variable
    """
    y_pred = model.predict(predictors)
    cm = confusion_matrix(target, y_pred)
    labels = np.asarray(
        [
            ["{0:0.0f}".format(item) + "\n{0:.2%}".format(item / cm.flatten().sum())]
            for item in cm.flatten()
        ]
    ).reshape(2, 2)

    plt.figure(figsize=(6, 4))
    sns.heatmap(cm, annot=labels, fmt="")
    plt.ylabel("True label")
    plt.xlabel("Predicted label")

#The metric
scorer = metrics.make_scorer(metrics.f1_score)

# List to store all the models
models = []

# Appending models into the list
models.append(("Bagging", BaggingClassifier (estimator = DecisionTreeClassifier(random_state = 1, class_weight='balanced'), random_state=1)))
models.append(("Random forest",RandomForestClassifier(random_state=1, class_weight='balanced')))
models.append(("GBM", GradientBoostingClassifier(random_state=1)))
models.append(("Adaboost", AdaBoostClassifier(random_state=1)))
models.append(("Xgboost", XGBClassifier(random_state=1, eval_metric="logloss")))
models.append(("dtree", DecisionTreeClassifier(random_state=1, class_weight='balanced')))

# List to store all model's CV scores
results1 = []
# List to store name of the models
names = []


# loop through all models to get the mean cross validated score
print("\n" "Cross-Validation performance on training dataset:" "\n")

for name, model in models:
    kfold = StratifiedKFold(n_splits= 5, shuffle=True, random_state=1)
    cv_result = cross_val_score(estimator=model, X=X_train, y=y_train, scoring = scorer,cv=kfold)
    results1.append(cv_result)
    names.append(name)
    print("{}: {}".format(name, cv_result.mean()))

print("\n" "Validation Performance:" "\n")

for name, model in models:
    model.fit(X_train, y_train)
    scores = f1_score(y_val, model.predict(X_val))
    print("{}: {}".format(name, scores))

Cross-Validation performance on training dataset:

Bagging: 0.7795174942234298
Random forest: 0.8045334827520751
GBM: 0.823039791269532
Adaboost: 0.8203377989495703
Xgboost: 0.8095211182586954
dtree: 0.7448931643789144

Validation Performance:

Bagging: 0.7717968157695224
Random forest: 0.8035695755940645
GBM: 0.8195818459969403
Adaboost: 0.8158053488839735
Xgboost: 0.8083918974782968
dtree: 0.7421138211382113

# Plotting boxplots for CV scores of all models defined above
fig = plt.figure(figsize=(10, 7))

fig.suptitle("Algorithm Comparison")
ax = fig.add_subplot(111)

plt.boxplot(results1)
ax.set_xticklabels(names)

plt.show()

print("Before OverSampling, counts of label '1': {}".format(sum(y_train == 1)))
print("Before OverSampling, counts of label '0': {} \n".format(sum(y_train == 0)))

# Synthetic Minority Over Sampling Technique
sm = SMOTE(sampling_strategy= 1, k_neighbors= 5, random_state=1) ## Complete the code to set the k-nearest neighbors and sampling strategy
X_train_over, y_train_over = sm.fit_resample(X_train, y_train)


print("After OverSampling, counts of label '1': {}".format(sum(y_train_over == 1)))
print("After OverSampling, counts of label '0': {} \n".format(sum(y_train_over == 0)))


print("After OverSampling, the shape of train_X: {}".format(X_train_over.shape))
print("After OverSampling, the shape of train_y: {} \n".format(y_train_over.shape))

Before OverSampling, counts of label '1': 11913
Before OverSampling, counts of label '0': 5923 

After OverSampling, counts of label '1': 11913
After OverSampling, counts of label '0': 11913 

After OverSampling, the shape of train_X: (23826, 21)
After OverSampling, the shape of train_y: (23826,)

#List to store all the models
models = []

# Appending models into the list - Also added class_weight as a parameter and set it to 'balanced' on BaggingClassifier, RandomForest, and dtree
models.append(("Bagging", BaggingClassifier(estimator = DecisionTreeClassifier(random_state = 1, class_weight='balanced'), random_state=1)))
models.append(("Random forest",RandomForestClassifier(random_state=1, class_weight='balanced')))
models.append(("GBM", GradientBoostingClassifier(random_state=1)))
models.append(("Adaboost", AdaBoostClassifier(random_state=1)))
models.append(("Xgboost", XGBClassifier(random_state=1, eval_metric="logloss")))
models.append(("dtree", DecisionTreeClassifier(random_state=1, class_weight='balanced')))

#List to store all model's CV scores
results1 = []
#List to store name of the models
names = []


# loop through all models to get the mean cross validated score
print("\n" "Cross-Validation performance on training dataset:" "\n")

for name, model in models:
    kfold = StratifiedKFold(n_splits= 5, shuffle=True, random_state=1)
    cv_result = cross_val_score(estimator=model, X=X_train_over, y=y_train_over,scoring = scorer, cv=kfold)
    results1.append(cv_result)
    names.append(name)
    print("{}: {}".format(name, cv_result.mean()))

print("\n" "Validation Performance:" "\n")

for name, model in models:
    model.fit(X_train_over, y_train_over)
    scores = f1_score(y_val, model.predict(X_val))
    print("{}: {}".format(name, scores))

Cross-Validation performance on training dataset:

Bagging: 0.7556376199513684
Random forest: 0.7938138292522969
GBM: 0.8076949280007495
Adaboost: 0.8013161599972107
Xgboost: 0.799430071068073
dtree: 0.7236738310580881

Validation Performance:

Bagging: 0.7606724176067242
Random forest: 0.7953896584540552
GBM: 0.8125259228535877
Adaboost: 0.8120255086547221
Xgboost: 0.8039950062421972
dtree: 0.7387687188019967

# Plotting boxplots for CV scores of all models defined above
fig = plt.figure(figsize=(10, 7))

fig.suptitle("Algorithm Comparison")
ax = fig.add_subplot(111)

plt.boxplot(results1)
ax.set_xticklabels(names)

plt.show()

rus = RandomUnderSampler(random_state=1, sampling_strategy=1)
X_train_un, y_train_un = rus.fit_resample(X_train, y_train)


print("Before UnderSampling, counts of label '1': {}".format(sum(y_train == 1)))
print("Before UnderSampling, counts of label '0': {} \n".format(sum(y_train == 0)))


print("After UnderSampling, counts of label '1': {}".format(sum(y_train_un == 1)))
print("After UnderSampling, counts of label '0': {} \n".format(sum(y_train_un == 0)))


print("After UnderSampling, the shape of train_X: {}".format(X_train_un.shape))
print("After UnderSampling, the shape of train_y: {} \n".format(y_train_un.shape))

Before UnderSampling, counts of label '1': 11913
Before UnderSampling, counts of label '0': 5923 

After UnderSampling, counts of label '1': 5923
After UnderSampling, counts of label '0': 5923 

After UnderSampling, the shape of train_X: (11846, 21)
After UnderSampling, the shape of train_y: (11846,)

#List to store all the models
models = []

# Appending models into the list
models.append(("Bagging", BaggingClassifier(estimator = DecisionTreeClassifier(random_state = 1, class_weight='balanced'), random_state=1)))
models.append(("Random forest",RandomForestClassifier(random_state=1, class_weight='balanced')))
models.append(("GBM", GradientBoostingClassifier(random_state=1)))
models.append(("Adaboost", AdaBoostClassifier(random_state=1)))
models.append(("Xgboost", XGBClassifier(random_state=1, eval_metric="logloss")))
models.append(("dtree", DecisionTreeClassifier(random_state=1, class_weight='balanced')))

#List to store all model's CV scores
results1 = []
#List to store name of the models
names = []


# loop through all models to get the mean cross validated score
print("\n" "Cross-Validation performance on training dataset:" "\n")

for name, model in models:
    kfold = StratifiedKFold(n_splits= 5, shuffle=True, random_state=1)
    cv_result = cross_val_score(estimator=model, X=X_train_un, y=y_train_un,scoring = scorer, cv=kfold,n_jobs =-1)
    results1.append(cv_result)
    names.append(name)
    print("{}: {}".format(name, cv_result.mean()))

print("\n" "Validation Performance:" "\n")

for name, model in models:
    model.fit(X_train_un, y_train_un)
    scores = f1_score(y_val, model.predict(X_val))
    print("{}: {}".format(name, scores))

Cross-Validation performance on training dataset:

Bagging: 0.642537896710199
Random forest: 0.6865639509715183
GBM: 0.7131358906535971
Adaboost: 0.6949405744215158
Xgboost: 0.6944693136408734
dtree: 0.6151388244082543

Validation Performance:

Bagging: 0.6916956737941323
Random forest: 0.734144015259895
GBM: 0.7608695652173914
Adaboost: 0.7604202747950584
Xgboost: 0.7423652871123688
dtree: 0.6839080459770115

# Plotting boxplots for CV scores of all models defined above
fig = plt.figure(figsize=(10, 7))

fig.suptitle("Algorithm Comparison")
ax = fig.add_subplot(111)

plt.boxplot(results1)
ax.set_xticklabels(names)

plt.show()

%%time

# defining model
Model = AdaBoostClassifier(random_state=1)

# Parameter grid to pass in RandomSearchCV
param_grid = {
    "n_estimators": np.arange(30, 110, 20),
    "learning_rate": [0.01, 0.08, 0.1, 0.2, 0.65, 1],
    "estimator": [DecisionTreeClassifier(max_depth=1, random_state=1),
                  DecisionTreeClassifier(max_depth=2, random_state=1),
                  DecisionTreeClassifier(max_depth=3, random_state=1),
    ]
}


#Calling RandomizedSearchCV
randomized_cv = RandomizedSearchCV(estimator=Model, param_distributions=param_grid, n_iter=50, n_jobs = -1, scoring=scorer, cv= 5, random_state=1)

#Fitting parameters in RandomizedSearchCV
randomized_cv.fit(X_train_over, y_train_over)

print("Best parameters are {} with CV score={}:" .format(randomized_cv.best_params_,randomized_cv.best_score_))

Best parameters are {'n_estimators': np.int64(90), 'learning_rate': 1, 'estimator': DecisionTreeClassifier(max_depth=2, random_state=1)} with CV score=0.7984283842220219:
CPU times: user 6.5 s, sys: 531 ms, total: 7.04 s
Wall time: 6min 39s

#The best parameters
tuned_ada = AdaBoostClassifier(n_estimators= 90, learning_rate= 1, estimator= DecisionTreeClassifier(max_depth=2, random_state=1))

tuned_ada.fit(X_train_over, y_train_over)

AdaBoostClassifier(estimator=DecisionTreeClassifier(max_depth=2,
                                                    random_state=1),
                   learning_rate=1, n_estimators=90)

AdaBoostClassifier(estimator=DecisionTreeClassifier(max_depth=2,
                                                    random_state=1),
                   learning_rate=1, n_estimators=90)

DecisionTreeClassifier(max_depth=2, random_state=1)

DecisionTreeClassifier(max_depth=2, random_state=1)

#Model performance on train set
confusion_matrix_sklearn(tuned_ada,X_train,y_train)

#Model performance on test set
confusion_matrix_sklearn(tuned_ada,X_test,y_test)

ada_train_perf = model_performance_classification_sklearn(tuned_ada, X_train_over, y_train_over)
ada_train_perf

#Check the model performance for validation data.
ada_val_perf = model_performance_classification_sklearn(tuned_ada,X_val,y_val)
ada_val_perf

%%time

# defining model
Model = RandomForestClassifier(random_state=1)

# Parameter grid to pass in RandomSearchCV
param_grid = {
    "n_estimators": [70, 120, 200, 250],
    "min_samples_leaf": np.arange(1, 10, 5),
    "max_features": [np.arange(3, 10, 6),'sqrt'],
    "max_samples": np.arange(0.2, 1, 0.6)}


#Calling RandomizedSearchCV
randomized_cv = RandomizedSearchCV(estimator=Model, param_distributions=param_grid, n_iter=50, n_jobs = -1, scoring=scorer, cv= 5, random_state=1)

#Fitting parameters in RandomizedSearchCV
randomized_cv.fit(X_train_un, y_train_un)

print("Best parameters are {} with CV score={}:" .format(randomized_cv.best_params_,randomized_cv.best_score_))

Best parameters are {'n_estimators': 200, 'min_samples_leaf': np.int64(6), 'max_samples': np.float64(0.2), 'max_features': 'sqrt'} with CV score=0.7219547332407472:
CPU times: user 2.02 s, sys: 135 ms, total: 2.15 s
Wall time: 1min 46s

#The best model
tuned_rf2 = RandomForestClassifier(
    max_features= 'sqrt',
    random_state= 1,
    max_samples= 0.2,
    n_estimators= 200,
    min_samples_leaf= 6,
)

tuned_rf2.fit(X_train_un, y_train_un)

RandomForestClassifier(max_samples=0.2, min_samples_leaf=6, n_estimators=200,
                       random_state=1)

RandomForestClassifier(max_samples=0.2, min_samples_leaf=6, n_estimators=200,
                       random_state=1)

#Model performance on train set
confusion_matrix_sklearn(tuned_rf2,X_train,y_train)

#Model performance on test set
confusion_matrix_sklearn(tuned_rf2,X_test,y_test)

#The model performance on the train data.
rf2_train_perf = model_performance_classification_sklearn(tuned_rf2, X_train_un, y_train_un)
rf2_train_perf

#The model performance on the validation data.
rf2_val_perf = model_performance_classification_sklearn(tuned_rf2, X_val, y_val)
rf2_val_perf

%%time

#Defining model
Model = GradientBoostingClassifier(random_state=1)

#The code to define the hyper parameters.
param_grid={"n_estimators": np.arange(100, 201, 400),
            "learning_rate": [0.01, 0.05, 0.1, 0.2],
            "subsample":[0.7, 0.8, 1.0],
            "max_features":['sqrt', 0.6,0.8]}

## Complete the code to set the cv parameter.
randomized_cv = RandomizedSearchCV(estimator=Model, param_distributions=param_grid, scoring=scorer, n_iter=50, n_jobs = -1, cv= 5, random_state=1)

#Fitting parameters in RandomizedSearchCV
randomized_cv.fit(X_train_over, y_train_over)

print("Best parameters are {} with CV score={}:" .format(randomized_cv.best_params_,randomized_cv.best_score_))

Best parameters are {'subsample': 0.7, 'n_estimators': np.int64(100), 'max_features': 0.8, 'learning_rate': 0.2} with CV score=0.8030901840578842:
CPU times: user 4.8 s, sys: 396 ms, total: 5.2 s
Wall time: 4min 58s

#The code to define the best model.
tuned_gbm = GradientBoostingClassifier(
    max_features=0.8,
    random_state=1,
    learning_rate=0.2,
    n_estimators=100,
    subsample=0.7
)

tuned_gbm.fit(X_train_over, y_train_over)

GradientBoostingClassifier(learning_rate=0.2, max_features=0.8, random_state=1,
                           subsample=0.7)

GradientBoostingClassifier(learning_rate=0.2, max_features=0.8, random_state=1,
                           subsample=0.7)

#Model performance on train set
confusion_matrix_sklearn(tuned_gbm,X_train,y_train)

#Model performance on test set
confusion_matrix_sklearn(tuned_gbm,X_test,y_test)

#The model performance on the train data.
gbm_train_perf = model_performance_classification_sklearn(tuned_gbm, X_train_over, y_train_over)
gbm_train_perf

#The model performance on the validation data.
gbm_val_perf = model_performance_classification_sklearn(tuned_gbm,X_val, y_val)
gbm_val_perf

%%time

#Defining model
Model = XGBClassifier(random_state=1,eval_metric='logloss')

#Define the hyperparameters
param_grid={'n_estimators':[100, 150, 200, 250, 300],
            'scale_pos_weight':[1, 2, 3],
            'learning_rate':[0.01, 0.05, 0.1, 0.2],
            'gamma':[0, 0.1, 0.2, 0.5],
            'subsample':[0.7, 0.8, 0.9, 1.0]}

#Set the cv parameter
randomized_cv = RandomizedSearchCV(estimator=Model, param_distributions=param_grid, n_iter=50, n_jobs = -1, scoring=scorer, cv=5, random_state=1)

#Fitting parameters in RandomizedSearchCV
randomized_cv.fit(X_train_over,y_train_over)## Complete the code to fit the model on over sampled data

print("Best parameters are {} with CV score={}:" .format(randomized_cv.best_params_,randomized_cv.best_score_))

Best parameters are {'subsample': 1.0, 'scale_pos_weight': 3, 'n_estimators': 100, 'learning_rate': 0.2, 'gamma': 0} with CV score=0.813894202282847:
CPU times: user 2.52 s, sys: 274 ms, total: 2.79 s
Wall time: 2min 29s

#Define the best model
xgb2 = XGBClassifier(
    random_state=1,
    eval_metric='logloss',
    subsample=1,
    scale_pos_weight=3,
    n_estimators=100,
    learning_rate=0.2,
    gamma=0,
)

xgb2.fit(X_train_over, y_train_over)

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric='logloss',
              feature_types=None, feature_weights=None, gamma=0,
              grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.2, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=100, n_jobs=None,
              num_parallel_tree=None, ...)

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric='logloss',
              feature_types=None, feature_weights=None, gamma=0,
              grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.2, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=100, n_jobs=None,
              num_parallel_tree=None, ...)

#Model performance on train set
confusion_matrix_sklearn(xgb2,X_train,y_train)

#Model performance on test set
confusion_matrix_sklearn(xgb2,X_test,y_test)

#The model performance on the train data.
xgb2_train_perf = model_performance_classification_sklearn(xgb2, X_train_over, y_train_over)
xgb2_train_perf

#The model performance on the validation data.
xgb2_val_perf = model_performance_classification_sklearn(xgb2,X_val, y_val)
xgb2_val_perf

#Training performance comparison

models_train_comp_df = pd.concat(
    [
        gbm_train_perf.T,
        xgb2_train_perf.T,
        ada_train_perf.T,
        rf2_train_perf.T,
    ],
    axis=1,
)
models_train_comp_df.columns = [
    "Gradient Boosting tuned with oversampled data",
    "XGBoost tuned with oversampled data",
    "AdaBoost tuned with oversampled data",
    "Random forest tuned with undersampled data",
]
print("Training performance comparison:")
models_train_comp_df

Training performance comparison:

#Validation performance comparison

models_val_comp_df = pd.concat(
    [
        gbm_val_perf.T,
        xgb2_val_perf.T,
        ada_val_perf.T,
        rf2_val_perf.T,
    ],
    axis=1,
)
models_val_comp_df.columns = [
    "Gradient Boosting tuned with oversampled data",
    "XGBoost tuned with oversampled data",
    "AdaBoost tuned with oversampled data",
    "Random forest tuned with undersampled data",
]
print("Validation performance comparison:")
models_val_comp_df

Validation performance comparison:

#The model performance on the test data by the best model.
test = model_performance_classification_sklearn(xgb2, X_test, y_test)
test

feature_names = X_train.columns
importances = xgb2.feature_importances_
indices = np.argsort(importances)

plt.figure(figsize=(12, 12))
plt.title("Feature Importances")
plt.barh(range(len(indices)), importances[indices], color="blue", align="center")
plt.yticks(range(len(indices)), [feature_names[i] for i in indices])
plt.xlabel("Relative Importance")
plt.show()

	count	unique	top	freq	mean	std	min	25%	50%	75%	max
case_id	25480	25480	EZYV25480	1	NaN	NaN	NaN	NaN	NaN	NaN	NaN
continent	25480	6	Asia	16861	NaN	NaN	NaN	NaN	NaN	NaN	NaN
education_of_employee	25480	4	Bachelor's	10234	NaN	NaN	NaN	NaN	NaN	NaN	NaN
has_job_experience	25480	2	Y	14802	NaN	NaN	NaN	NaN	NaN	NaN	NaN
requires_job_training	25480	2	N	22525	NaN	NaN	NaN	NaN	NaN	NaN	NaN
no_of_employees	25480.0	NaN	NaN	NaN	5667.04321	22877.928848	-26.0	1022.0	2109.0	3504.0	602069.0
yr_of_estab	25480.0	NaN	NaN	NaN	1979.409929	42.366929	1800.0	1976.0	1997.0	2005.0	2016.0
region_of_employment	25480	5	Northeast	7195	NaN	NaN	NaN	NaN	NaN	NaN	NaN
prevailing_wage	25480.0	NaN	NaN	NaN	74455.814592	52815.942327	2.1367	34015.48	70308.21	107735.5125	319210.27
unit_of_wage	25480	4	Year	22962	NaN	NaN	NaN	NaN	NaN	NaN	NaN
full_time_position	25480	2	Y	22773	NaN	NaN	NaN	NaN	NaN	NaN	NaN
case_status	25480	2	Certified	17018	NaN	NaN	NaN	NaN	NaN	NaN	NaN

	count	mean	std	min	25%	50%	75%	max
no_of_employees	25480.0	5667.089207	22877.917453	11.0000	1022.00	2109.00	3504.0000	602069.00
yr_of_estab	25480.0	1979.409929	42.366929	1800.0000	1976.00	1997.00	2005.0000	2016.00
prevailing_wage	25480.0	74455.814592	52815.942327	2.1367	34015.48	70308.21	107735.5125	319210.27

	Gradient Boosting tuned with oversampled data	XGBoost tuned with oversampled data	AdaBoost tuned with oversampled data	Random forest tuned with undersampled data
Accuracy	0.803618	0.803996	0.787669	0.736367
Recall	0.860572	0.988920	0.841014	0.765659
Precision	0.772570	0.721919	0.759936	0.723285
F1	0.814200	0.834585	0.798422	0.743869

	Gradient Boosting tuned with oversampled data	XGBoost tuned with oversampled data	AdaBoost tuned with oversampled data	Random forest tuned with undersampled data
Accuracy	0.736299	0.710714	0.734118	0.709842
Recall	0.849804	0.958206	0.839356	0.733348
Precision	0.776452	0.710000	0.779462	0.813768
F1	0.811474	0.815638	0.808301	0.771468

Model	Sampling	Val Accuracy	Val Recall	Val Precision	Val F1	Key Strength
Random Forest	Undersampled	~0.71	~0.73	~0.81	~0.77	Simple, balanced
Gradient Boosting	Oversampled	~0.74	~0.85	~0.78	~0.81	Best overall balance
XGBoost	Oversampled	~0.71	~0.96	~0.71	~0.82	Extreme recall
AdaBoost	Oversampled	~0.73	~0.84	~0.78	~0.81	Safer middle-ground

	case_id	continent	education_of_employee	has_job_experience	requires_job_training	no_of_employees	yr_of_estab	region_of_employment	prevailing_wage	unit_of_wage	full_time_position	case_status
0	EZYV01	Asia	High School	N	N	14513	2007	West	592.2029	Hour	Y	Denied
1	EZYV02	Asia	Master's	Y	N	2412	2002	Northeast	83425.6500	Year	Y	Certified
2	EZYV03	Asia	Bachelor's	N	Y	44444	2008	West	122996.8600	Year	Y	Denied
3	EZYV04	Asia	Bachelor's	N	N	98	1897	West	83434.0300	Year	Y	Denied
4	EZYV05	Africa	Master's	Y	N	1082	2005	South	149907.3900	Year	Y	Certified

	case_id	continent	education_of_employee	has_job_experience	requires_job_training	no_of_employees	yr_of_estab	region_of_employment	prevailing_wage	unit_of_wage	full_time_position	case_status
25475	EZYV25476	Asia	Bachelor's	Y	Y	2601	2008	South	77092.57	Year	Y	Certified
25476	EZYV25477	Asia	High School	Y	N	3274	2006	Northeast	279174.79	Year	Y	Certified
25477	EZYV25478	Asia	Master's	Y	N	1121	1910	South	146298.85	Year	N	Certified
25478	EZYV25479	Asia	Master's	Y	Y	1918	1887	West	86154.77	Year	Y	Certified
25479	EZYV25480	Asia	Bachelor's	Y	N	3195	1960	Midwest	70876.91	Year	Y	Certified

	case_id	continent	education_of_employee	has_job_experience	requires_job_training	no_of_employees	yr_of_estab	region_of_employment	prevailing_wage	unit_of_wage	full_time_position	case_status
245	EZYV246	Europe	Master's	N	N	-25	1980	Northeast	39452.9900	Year	Y	Certified
378	EZYV379	Asia	Bachelor's	N	Y	-11	2011	Northeast	32506.1400	Year	Y	Denied
832	EZYV833	South America	Master's	Y	N	-17	2002	South	129701.9400	Year	Y	Certified
2918	EZYV2919	Asia	Master's	Y	N	-26	2005	Midwest	112799.4600	Year	Y	Certified
6439	EZYV6440	Asia	Bachelor's	N	N	-14	2013	South	103.9700	Hour	Y	Denied
6634	EZYV6635	Asia	Bachelor's	Y	N	-26	1923	West	5247.3200	Year	Y	Denied
7224	EZYV7225	Europe	Doctorate	N	N	-25	1998	Midwest	141435.9500	Year	Y	Certified
7281	EZYV7282	Asia	High School	N	N	-14	2000	Midwest	58488.5000	Year	Y	Denied
7318	EZYV7319	Asia	Bachelor's	Y	Y	-26	2006	South	115005.6100	Year	Y	Certified
7761	EZYV7762	Asia	Master's	N	N	-11	2009	Midwest	38457.5100	Year	Y	Certified
9872	EZYV9873	Europe	Master's	Y	N	-26	1996	South	37397.0500	Year	Y	Certified
11493	EZYV11494	Asia	High School	Y	N	-14	1999	South	27599.3500	Year	Y	Denied
13471	EZYV13472	North America	Master's	N	N	-17	2003	Northeast	257.2413	Hour	Y	Denied
14022	EZYV14023	Asia	Bachelor's	N	Y	-11	1946	Northeast	108403.5600	Year	Y	Certified
14146	EZYV14147	Asia	Bachelor's	N	Y	-26	1954	West	81982.2700	Year	Y	Certified
14726	EZYV14727	Asia	Master's	N	N	-11	2000	Midwest	167851.8000	Year	Y	Certified
15600	EZYV15601	Asia	Bachelor's	N	N	-14	2014	South	24641.6100	Year	Y	Denied
15859	EZYV15860	Asia	High School	N	N	-11	1969	South	44640.6000	Year	Y	Denied
16157	EZYV16158	Asia	Master's	Y	N	-11	1994	South	62681.2500	Year	Y	Certified
16883	EZYV16884	North America	Bachelor's	Y	N	-26	1968	Northeast	168.1558	Hour	Y	Denied
17006	EZYV17007	Asia	Doctorate	Y	N	-11	1984	West	25753.5100	Year	Y	Denied
17655	EZYV17656	North America	Bachelor's	Y	N	-17	2007	Northeast	129753.1800	Year	Y	Denied
17844	EZYV17845	Asia	Bachelor's	N	N	-14	2012	West	29325.8500	Year	Y	Denied
17983	EZYV17984	Asia	Bachelor's	N	N	-26	2004	South	84359.9800	Year	Y	Denied
20815	EZYV20816	Asia	Bachelor's	N	Y	-17	1990	West	91897.5700	Year	Y	Certified
20984	EZYV20985	Europe	Doctorate	Y	N	-14	1989	Midwest	37012.8000	Year	Y	Certified
21255	EZYV21256	North America	High School	N	N	-25	1987	South	99405.4700	Year	N	Denied
21760	EZYV21761	Asia	Bachelor's	Y	N	-25	2000	West	100463.5800	Year	Y	Certified
21944	EZYV21945	Africa	Master's	Y	N	-25	1977	Midwest	79150.5100	Year	Y	Certified
22084	EZYV22085	North America	Bachelor's	Y	N	-14	1980	West	691.0609	Hour	Y	Denied
22388	EZYV22389	Asia	Master's	Y	N	-14	1986	South	17893.1100	Year	Y	Certified
23186	EZYV23187	Asia	Master's	N	Y	-11	2007	Midwest	120195.3500	Year	Y	Certified
23476	EZYV23477	Europe	Master's	Y	N	-11	2000	West	95072.7500	Year	Y	Denied

Problem Statement¶

Business Context¶

Objective¶

Data Description¶

Installing and Importing the necessary libraries¶

Import Dataset¶

Overview of the Dataset¶

View the first and last 5 rows of the dataset¶

Understand the shape of the dataset¶

Observations:¶

Check the data types of the columns for the dataset¶

Observations:¶

Observations:¶

Exploratory Data Analysis (EDA)¶

Let's check the statistical summary of the data¶

Observations¶

Fixing the negative values in number of employees columns¶

Observations:¶

Let's check the count of each unique category in each of the categorical variables¶

Observations:¶

Univariate Analysis¶

Observations on education of employee¶

Observations¶

Observations on region of employment¶

Observations¶

Observations on job experience¶

Observations¶

Observations on case status¶

Observations¶

Bivariate Analysis¶

Observations¶

Does higher education increase the chances of visa certification for well-paid jobs abroad?¶

Observations¶

How does visa status vary across different continents?¶

Observations¶

Does having prior work experience influence the chances of visa certification for career opportunities abroad?¶

Observations¶

Is the prevailing wage consistent across all regions of the US?¶

Observations¶

Does visa status vary with changes in the prevailing wage set to protect both local talent and foreign workers?¶

Observations¶

Does the unit of prevailing wage (Hourly, Weekly, etc.) have any impact on the likelihood of visa application certification?¶

Observations¶

Data Pre-processing¶

Outlier Check¶

Observations¶

Data Preparation for modeling¶

Model Building¶

Model Evaluation Criterion¶

Defining scorer to be used for cross-validation and hyperparameter tuning¶

Model building with Original data¶

Observations¶

Model Building with Oversampled data¶

Observations¶

Model Building with Undersampled data¶

Observations¶

Hyperparameter Tuning¶

Tuning AdaBoost tuning oversampled data¶

Observations¶

Observations¶

Observations¶

Observations¶

Model Performance Summary and Final Model Selection¶

Observations¶

Model Selection:¶

Actionable Insights and Recommendations¶

Final Pick for EasyVisa:¶