Best Practices
Feature Engineering
Normalize parameter to get the percentage
The following example shows how to use the normalize parameter of value_counts() to get the class distribution as percentages:

print(y_test.value_counts(normalize=True) * 100)

Exited
0    79.25
1    20.75
Split into Train, Test and Validation
The following example shows how to:
- divide the data into temporary and test sets with a ratio of 80:20
- divide the temporary set into train and validation sets with a ratio of 75:25
Since 75% of the 80% temporary set goes to training, the final split is 60:20:20 for train, validation, and test.
from sklearn.model_selection import train_test_split

# first we split data into 2 parts, say temporary and test
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1, stratify=y
)

# then we split the temporary set into train and validation
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.25, random_state=1, stratify=y_temp
)

print(X_train.shape, X_val.shape, X_test.shape)
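Because both splits use stratify, each subset should keep roughly the same class ratio. A minimal sanity-check sketch, assuming the y_train, y_val, and y_test variables created above:

# Class proportions should be nearly identical across the three splits
for name, target in [("train", y_train), ("val", y_val), ("test", y_test)]:
    print(name, target.value_counts(normalize=True).round(4).to_dict())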
Data Encoding
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df["Classifier"] = le.fit_transform(df["Classifier"])
le.transform(["yes", "no"])

=> LabelEncoder assigns numbers in alphabetical order of the class labels, so "no" = 0 and "yes" = 1.
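To map the encoded numbers back to the original labels, LabelEncoder also provides inverse_transform. A minimal sketch, assuming the fitted encoder le from above:

# Recover the original string labels from the encoded values
le.inverse_transform([0, 1])   # -> array(['no', 'yes'], dtype=object)
le.classes_                    # classes in the order they were encoded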
Binning/Bucketing
import pandas as pd

# Discretize variable into equal-sized buckets based on quantiles
df["city"] = pd.qcut(
    df["city_development_index"],
    q=[0, 0.25, 0.5, 1],
    labels=["Under_Developed", "Developing", "Developed"],
)
Missing-Value Treatment – KNN Imputer
One way to treat missing values is to use the KNN imputer.
· Each sample's missing values are imputed by looking at the n_neighbors nearest neighbors found in the training set. The default is n_neighbors=5.
· The KNN imputer replaces missing values with the average of the k nearest non-missing feature values.
· Nearest points are found based on Euclidean distance.
The imputed values are not always integers, which may not be appropriate for categorical columns.
- To take care of that, we can round off the obtained values to the nearest integer, as shown in the sketch after the code block below.
from sklearn.impute import KNNImputer

# Define the imputer
imputer = KNNImputer(n_neighbors=5)

# Define list of columns to impute
reqd_col_for_impute = ["gender", "enrolled_university"]

# Convert/encode any non-numeric columns to numerical values before executing the imputer
X[reqd_col_for_impute] = imputer.fit_transform(X[reqd_col_for_impute])
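A minimal sketch of the rounding step mentioned above, assuming the imputed columns in X hold encoded categorical values:

import numpy as np

# Round imputed values to the nearest integer so they map back to valid category codes
X[reqd_col_for_impute] = np.round(X[reqd_col_for_impute]).astype(int)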
Missing-Value Treatment – Simple Imputer
This is another way to treat missing values when you want to replace them with the mean, median, most frequent value (mode), or a constant.
# Import the library
from sklearn.impute import SimpleImputer

# Define the imputer; fill_value only takes effect with strategy='constant', so it is omitted here
imputer = SimpleImputer(strategy='median')
dataframe_imputed = imputer.fit_transform(dataframe)
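For completeness, a minimal sketch of the strategy='constant' case, where fill_value actually takes effect (the placeholder value 'missing' is an assumption for illustration):

# Replace every missing entry with a fixed placeholder value
constant_imputer = SimpleImputer(strategy='constant', fill_value='missing')
dataframe_const = constant_imputer.fit_transform(dataframe)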
Oversampling of Data
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE

lr1 = LogisticRegression(random_state=1)
lr1.fit(X_train, y_train)
model_performance_classification_sklearn(lr1, X_train, y_train)

# Synthetic Minority Oversampling Technique
sm = SMOTE(sampling_strategy=1, k_neighbors=5, random_state=1)
X_train_over, y_train_over = sm.fit_resample(X_train, y_train)

lr2 = LogisticRegression(random_state=1)
lr2.fit(X_train_over, y_train_over)
model_performance_classification_sklearn(lr2, X_train_over, y_train_over)
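A quick way to confirm the effect of the resampling is to compare class counts before and after. A minimal sketch, assuming the variables from above:

# With sampling_strategy=1, the minority class is oversampled until the classes are balanced
print(y_train.value_counts())
print(y_train_over.value_counts())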
Confusion Matrix from Sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

rf = RandomForestClassifier(random_state=1)
rf.fit(X_train, y_train)
confusion_matrix(y_train, rf.predict(X_train))
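Recent scikit-learn versions (1.0+) can also plot the matrix directly via ConfusionMatrixDisplay; a minimal sketch, assuming matplotlib is available:

from sklearn.metrics import ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# Plot the confusion matrix for the fitted random forest
ConfusionMatrixDisplay.from_estimator(rf, X_train, y_train)
plt.show()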
Cross-Validation Score
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score

models = []  # Empty list to store all the models

# Appending models into the list
models.append(("Bagging", BaggingClassifier(random_state=1)))
models.append(("Random forest", RandomForestClassifier(random_state=1)))
models.append(("LR", LogisticRegression(random_state=1)))
models.append(("dtree", DecisionTreeClassifier(random_state=1)))

results = []  # Empty list to store all models' CV scores
names = []  # Empty list to store names of the models

# Loop through all models to get the mean cross-validated score
print("\nCross-Validation Performance:\n")

for name, model in models:
    scoring = "recall"
    kfold = StratifiedKFold(
        n_splits=5, shuffle=True, random_state=1
    )  # Setting number of splits equal to 5
    cv_result = cross_val_score(
        estimator=model, X=X_train_over, y=y_train_over, scoring=scoring, cv=kfold
    )
    results.append(cv_result)
    names.append(name)
    print("{}: {}".format(name, cv_result.mean() * 100))
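Since results holds the per-fold scores for each model, a box plot makes the comparison easy to read. A minimal sketch using matplotlib (an assumption, since no plotting library appears in the original):

import matplotlib.pyplot as plt

# Compare the spread of per-fold recall scores across models
plt.figure(figsize=(10, 5))
plt.boxplot(results)
plt.xticks(range(1, len(names) + 1), names)
plt.title("Algorithm Comparison")
plt.ylabel("Recall")
plt.show()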
Using Randomized Search
import numpy as np

from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from sklearn.model_selection import RandomizedSearchCV

# Defining the model
model = AdaBoostClassifier(random_state=1)

# Parameter grid to pass to RandomizedSearchCV
param_grid = {
    "n_estimators": np.arange(10, 110, 10),
    "learning_rate": [0.1, 0.01, 0.2, 0.05, 1],
    # Note: in scikit-learn >= 1.2 this AdaBoost parameter is named "estimator"
    "base_estimator": [
        DecisionTreeClassifier(max_depth=1, random_state=1),
        DecisionTreeClassifier(max_depth=2, random_state=1),
        DecisionTreeClassifier(max_depth=3, random_state=1),
    ],
}

# Type of scoring used to compare parameter combinations
scorer = metrics.make_scorer(metrics.recall_score)

# Calling RandomizedSearchCV
randomized_cv = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_grid,
    n_jobs=-1,
    n_iter=50,
    scoring=scorer,
    cv=5,
    random_state=1,
)

# Fitting parameters in RandomizedSearchCV
randomized_cv.fit(X_train_over, y_train_over)

print(
    "Best parameters are {} with CV score={}:".format(
        randomized_cv.best_params_, randomized_cv.best_score_
    )
)
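After the search, the refit model is available as best_estimator_ (RandomizedSearchCV refits the best configuration on the full training data by default). A minimal evaluation sketch, assuming the X_val and y_val sets created in the split section:

tuned_model = randomized_cv.best_estimator_
print(metrics.recall_score(y_val, tuned_model.predict(X_val)))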
Undersampling of Data
from imblearn.under_sampling import RandomUnderSampler

rus = RandomUnderSampler(random_state=1)
X_train_un, y_train_un = rus.fit_resample(X_train, y_train)

model1 = AdaBoostClassifier(random_state=1)
model1.fit(X_train_un, y_train_un)
model_performance_classification_sklearn(model1, X_train_un, y_train_un)
model_performance_classification_sklearn(model1, X_val, y_val)

model2 = AdaBoostClassifier(random_state=1)
model2.fit(X_train_over, y_train_over)
model_performance_classification_sklearn(model2, X_train_over, y_train_over)
model_performance_classification_sklearn(model2, X_val, y_val)