Best Practices
Feature Engineering
Normalize parameter to get the percentage
The following example shows how to use the normalize parameter of value_counts() to get the class distribution as percentages:

print(y_test.value_counts(normalize=True) * 100)

Exited
0    79.25
1    20.75
Split into Train, Test and Validation
The following example shows how to:
- divide the data into temporary and test sets with a ratio of 80:20
- divide the temporary set into train and validation sets with a ratio of 75:25
Since 75% of the 80% temporary set goes to training, the final split is 60:20:20 for train, validation, and test.
from sklearn.model_selection import train_test_split

# first we split data into 2 parts, say temporary and test
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1, stratify=y
)

# then we split the temporary set into train and validation
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.25, random_state=1, stratify=y_temp
)

print(X_train.shape, X_val.shape, X_test.shape)
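Because both splits use stratify, each subset should keep roughly the same class ratio. A minimal sanity-check sketch, assuming the y_train, y_val, and y_test variables created above:

# Class proportions should be nearly identical across the three splits
for name, target in [("train", y_train), ("val", y_val), ("test", y_test)]:
    print(name, target.value_counts(normalize=True).round(4).to_dict())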
Data Encoding
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df["Classifier"] = le.fit_transform(df["Classifier"])
le.transform(["yes", "no"])

=> LabelEncoder assigns numbers in alphabetical order of the class labels, so "no" = 0 and "yes" = 1.
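To map the encoded numbers back to the original labels, LabelEncoder also provides inverse_transform. A minimal sketch, assuming the fitted encoder le from above:

# Recover the original string labels from the encoded values
le.inverse_transform([0, 1])   # -> array(['no', 'yes'], dtype=object)
le.classes_                    # classes in the order they were encoded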
Binning/Bucketing
import pandas as pd

# Discretize variable into equal-sized buckets based on quantiles
df["city"] = pd.qcut(
    df["city_development_index"],
    q=[0, 0.25, 0.5, 1],
    labels=["Under_Developed", "Developing", "Developed"],
)
Missing-Value Treatment – KNN Imputer
One way to treat missing values is to use the KNN imputer.
· Each sample's missing values are imputed by looking at the n_neighbors nearest neighbors found in the training set. The default is n_neighbors=5.
· The KNN imputer replaces missing values with the average of the k nearest non-missing feature values.
· Nearest points are found based on Euclidean distance.
The imputed values are not always integers, which may not be appropriate for categorical columns.
- To take care of that, we can round off the obtained values to the nearest integer, as shown in the sketch after the code block below.
from sklearn.impute import KNNImputer

# Define the imputer
imputer = KNNImputer(n_neighbors=5)

# Define list of columns to impute
reqd_col_for_impute = ["gender", "enrolled_university"]

# Convert/encode any non-numeric columns to numerical values before executing the imputer
X[reqd_col_for_impute] = imputer.fit_transform(X[reqd_col_for_impute])
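A minimal sketch of the rounding step mentioned above, assuming the imputed columns in X hold encoded categorical values:

import numpy as np

# Round imputed values to the nearest integer so they map back to valid category codes
X[reqd_col_for_impute] = np.round(X[reqd_col_for_impute]).astype(int)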
Missing-Value Treatment – Simple Imputer
This is another way to treat missing values when you want to replace them with the mean, median, most frequent value (mode), or a constant.
# Import the library
from sklearn.impute import SimpleImputer

# Define the imputer; fill_value only takes effect with strategy='constant', so it is omitted here
imputer = SimpleImputer(strategy='median')
dataframe_imputed = imputer.fit_transform(dataframe)
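For completeness, a minimal sketch of the strategy='constant' case, where fill_value actually takes effect (the placeholder value 'missing' is an assumption for illustration):

# Replace every missing entry with a fixed placeholder value
constant_imputer = SimpleImputer(strategy='constant', fill_value='missing')
dataframe_const = constant_imputer.fit_transform(dataframe)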
Oversampling of Data
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE

lr1 = LogisticRegression(random_state=1)
lr1.fit(X_train, y_train)
model_performance_classification_sklearn(lr1, X_train, y_train)

# Synthetic Minority Oversampling Technique
sm = SMOTE(sampling_strategy=1, k_neighbors=5, random_state=1)
X_train_over, y_train_over = sm.fit_resample(X_train, y_train)

lr2 = LogisticRegression(random_state=1)
lr2.fit(X_train_over, y_train_over)
model_performance_classification_sklearn(lr2, X_train_over, y_train_over)
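A quick way to confirm the effect of the resampling is to compare class counts before and after. A minimal sketch, assuming the variables from above:

# With sampling_strategy=1, the minority class is oversampled until the classes are balanced
print(y_train.value_counts())
print(y_train_over.value_counts())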
Confusion Matrix from Sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

rf = RandomForestClassifier(random_state=1)
rf.fit(X_train, y_train)
confusion_matrix(y_train, rf.predict(X_train))
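Recent scikit-learn versions (1.0+) can also plot the matrix directly via ConfusionMatrixDisplay; a minimal sketch, assuming matplotlib is available:

from sklearn.metrics import ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# Plot the confusion matrix for the fitted random forest
ConfusionMatrixDisplay.from_estimator(rf, X_train, y_train)
plt.show()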
Cross-Validation Score
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score

models = []  # Empty list to store all the models

# Appending models into the list
models.append(("Bagging", BaggingClassifier(random_state=1)))
models.append(("Random forest", RandomForestClassifier(random_state=1)))
models.append(("LR", LogisticRegression(random_state=1)))
models.append(("dtree", DecisionTreeClassifier(random_state=1)))

results = []  # Empty list to store all models' CV scores
names = []  # Empty list to store names of the models

# Loop through all models to get the mean cross-validated score
print("\nCross-Validation Performance:\n")

for name, model in models:
    scoring = "recall"
    kfold = StratifiedKFold(
        n_splits=5, shuffle=True, random_state=1
    )  # Setting number of splits equal to 5
    cv_result = cross_val_score(
        estimator=model, X=X_train_over, y=y_train_over, scoring=scoring, cv=kfold
    )
    results.append(cv_result)
    names.append(name)
    print("{}: {}".format(name, cv_result.mean() * 100))
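Since results holds the per-fold scores for each model, a box plot makes the comparison easy to read. A minimal sketch using matplotlib (an assumption, since no plotting library appears in the original):

import matplotlib.pyplot as plt

# Compare the spread of per-fold recall scores across models
plt.figure(figsize=(10, 5))
plt.boxplot(results)
plt.xticks(range(1, len(names) + 1), names)
plt.title("Algorithm Comparison")
plt.ylabel("Recall")
plt.show()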
Using Randomized Search
import numpy as np

from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from sklearn.model_selection import RandomizedSearchCV

# Defining the model
model = AdaBoostClassifier(random_state=1)

# Parameter grid to pass to RandomizedSearchCV
param_grid = {
    "n_estimators": np.arange(10, 110, 10),
    "learning_rate": [0.1, 0.01, 0.2, 0.05, 1],
    # Note: in scikit-learn >= 1.2 this AdaBoost parameter is named "estimator"
    "base_estimator": [
        DecisionTreeClassifier(max_depth=1, random_state=1),
        DecisionTreeClassifier(max_depth=2, random_state=1),
        DecisionTreeClassifier(max_depth=3, random_state=1),
    ],
}

# Type of scoring used to compare parameter combinations
scorer = metrics.make_scorer(metrics.recall_score)

# Calling RandomizedSearchCV
randomized_cv = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_grid,
    n_jobs=-1,
    n_iter=50,
    scoring=scorer,
    cv=5,
    random_state=1,
)

# Fitting parameters in RandomizedSearchCV
randomized_cv.fit(X_train_over, y_train_over)

print(
    "Best parameters are {} with CV score={}:".format(
        randomized_cv.best_params_, randomized_cv.best_score_
    )
)
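After the search, the refit model is available as best_estimator_ (RandomizedSearchCV refits the best configuration on the full training data by default). A minimal evaluation sketch, assuming the X_val and y_val sets created in the split section:

tuned_model = randomized_cv.best_estimator_
print(metrics.recall_score(y_val, tuned_model.predict(X_val)))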
Undersampling of Data
from imblearn.under_sampling import RandomUnderSampler

rus = RandomUnderSampler(random_state=1)
X_train_un, y_train_un = rus.fit_resample(X_train, y_train)

model1 = AdaBoostClassifier(random_state=1)
model1.fit(X_train_un, y_train_un)
model_performance_classification_sklearn(model1, X_train_un, y_train_un)
model_performance_classification_sklearn(model1, X_val, y_val)

model2 = AdaBoostClassifier(random_state=1)
model2.fit(X_train_over, y_train_over)
model_performance_classification_sklearn(model2, X_train_over, y_train_over)
model_performance_classification_sklearn(model2, X_val, y_val)