Answered step by step
Verified Expert Solution
Link Copied!

Question

1 Approved Answer

#Train Test Split from sklearn.model_selection import train_test_split X_train, X_test, y_train, y_test

#Train Test Split
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows as test data and train on the remaining 70%.
# The rows are assigned at random; random_state=123 seeds that shuffle so the
# same split is produced on every run.
# NOTE(review): X_new is not defined in the visible code, and y is only
# assigned in a later cell -- confirm both exist before this cell runs.
X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.3, random_state=123)
#%%
# Baseline model: a random forest with default hyperparameters (seeded with
# random_state=123), fitted on the training split and then used to predict
# labels for the held-out test split.
from sklearn.ensemble import RandomForestClassifier
rf_default = RandomForestClassifier(random_state=123)
rf_default.fit(X_train, y_train)
y_predict_rf = rf_default.predict(X_test)
#%%
def evaluate_model(y_predict, y_test):
    """Plot the confusion matrix and print accuracy, precision and recall.

    The confusion matrix is drawn as an annotated heatmap; the three scores
    are printed with two decimals, one per line, followed by a
    "Confusion Matrix:" heading for the plot.

    Relies on the names ``metrics`` and ``sns`` being available at module
    level. They are not imported in the visible code (presumably
    ``sklearn.metrics`` and ``seaborn`` -- confirm).

    Args:
        y_predict: Labels predicted by the model.
        y_test: True labels the predictions are compared against.

    Returns:
        None. The function only plots and prints.
    """
    confusion_matrix_ = metrics.confusion_matrix(y_test, y_predict)
    # annot=True writes the count into each cell; fmt="d" formats it as an integer.
    sns.heatmap(confusion_matrix_, annot=True, fmt="d")
    # NOTE(review): precision_score and recall_score are called with their
    # default arguments -- confirm these defaults suit the label encoding.
    print(
        "Accuracy: ", "{:.2f}".format(metrics.accuracy_score(y_test, y_predict)),
        "\nPrecision: ", "{:.2f}".format(metrics.precision_score(y_test, y_predict)),
        "\nRecall: ", "{:.2f}".format(metrics.recall_score(y_test, y_predict)),
        "\nConfusion Matrix: ",
    )
#%%
evaluate_model(y_predict_rf, y_test)
#We obtain the highest accuracy, precision and recall. However, we can use grid search cross-validation to check our model's performance again.
#According to Breiman (2001), who proposed Random Forest, max_features and n_estimators are the most important parameters of Random Forest. We can try to optimize them.
#In addition to this, we may try to balance the class weights to overcome the imbalanced-data problem.
#%%
# Hyperparameter grid for the search below.
# "auto" is intentionally not listed: for classifiers it was an alias of "sqrt"
# (already in the grid), it was deprecated in scikit-learn 1.1 and removed in
# 1.3, where passing it raises an error.
params = {
    'max_features': ["sqrt", "log2"],
    'n_estimators': [300, 500, 700, 1000],
}
#%%
# Both classes are needed in this cell; importing them here keeps the cell
# runnable on its own instead of relying on an import in a later cell.
from sklearn.model_selection import GridSearchCV, StratifiedKFold
# class_weight="balanced_subsample" is the class-weight balancing mentioned above.
rf_default = RandomForestClassifier(class_weight="balanced_subsample", random_state=123)
# 10 shuffled, stratified folds; the seed makes the fold assignment repeatable.
stratified_kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=123)
# n_jobs=-1 uses all available cores; verbose=2 prints progress per fit.
grid_search = GridSearchCV(rf_default, params, n_jobs=-1, cv=stratified_kfold, verbose=2)
# The search is run on the full data (X_new, y), not only on the training split.
grid_search_results = grid_search.fit(X_new, y.values.ravel())
#%%
# Separate the label column ('class') from the feature columns.
# NOTE(review): mushroom_data is not defined in the visible code -- it must be
# loaded before this cell.
target = 'class'
X = mushroom_data.drop(columns=[target])
y = mushroom_data[target]
print(f'Y shape ={y.shape}')
print(f'X shape ={X.shape}')
#%%
from sklearn.model_selection import train_test_split
# 80/20 train/test split, seeded with random_state=42.
# This rebinds X_train, X_test, y_train and y_test, replacing the values
# produced by the earlier 70/30 split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f'shape of X Train ={X_train.shape}')
print(f'shape of X Test ={X_test.shape}')
print(f'shape of Y Train ={y_train.shape}')
print(f'shape of Y Test ={y_test.shape}')
#%%
# Baseline accuracy: the relative frequency of the most common class in
# y_train, i.e. the training accuracy of always predicting that class.
acc_baseline = y_train.value_counts(normalize= True).max()
print(f'Accuracy of baseline ={acc_baseline}')
#%%
from sklearn.preprocessing import OrdinalEncoder
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
# Two-step pipeline: ordinal-encode the features, then fit a random forest
# seeded with random_state=42.
clf = make_pipeline(OrdinalEncoder(),
RandomForestClassifier(random_state=42))
# Search grid for the pipeline. Keys use the "<step name>__<parameter>" form,
# where 'randomforestclassifier' is the forest's step name in the pipeline.
# n_estimators takes 25, 50, 75 and max_depth takes 10, 20, ..., 60.
# This rebinds params, replacing the grid defined for the earlier search.
params ={
'randomforestclassifier__n_estimators': range(25,100,25),
'randomforestclassifier__max_depth': range(10,70,10)
}
# Bare expression: shows the grid when the cell is run interactively.
params
#%%
# summarize results
# Reports the best cross-validation score and the parameter combination that
# achieved it for grid_search_results, i.e. the search over max_features and
# n_estimators -- not the pipeline search stored in `model`.
print("Best: %f using %s"%(grid_search_results.best_score_, grid_search_results.best_params_))
#%%
from sklearn.model_selection import GridSearchCV
# Grid search over the pipeline `clf` using the `params` grid:
# 5-fold cross-validation, all available cores (n_jobs=-1), brief progress
# output (verbose=1).
model = GridSearchCV(
clf,
param_grid= params,
cv=5,
n_jobs=-1,
verbose =1
)
# Bare expression: shows the search object when the cell is run interactively.
model
#%%
# Run the search on the training split only.
model.fit(X_train , y_train)
#%%
# Collect the per-combination cross-validation results in a DataFrame and
# view them ordered by rank (rank 1 first).
# NOTE(review): pd is not imported in the visible code -- presumably pandas.
cv_results = pd.DataFrame(model.cv_results_)
cv_results.sort_values(by= 'rank_test_score')
#%%
# Final forest with explicit hyperparameters: balanced class weights per
# bootstrap sample, 300 trees, seeded with random_state=123.
# max_features='sqrt' is used instead of 'auto': for classifiers 'auto' was an
# alias of 'sqrt', and it was removed in scikit-learn 1.3.
rf_model = RandomForestClassifier(class_weight="balanced_subsample", max_features='sqrt', n_estimators=300,
                                  random_state=123)
# Fit and predict with rf_model itself; fitting rf_default here would leave
# the model configured above unused.
# NOTE(review): X is the raw feature frame from mushroom_data, whereas the
# earlier forest cells use X_new -- confirm X is in a form the forest accepts.
rf_model.fit(X, y)
# These predictions are for the same rows the model was fitted on, so any
# score computed from y_pred is an in-sample score.
y_pred = rf_model.predict(X)
# To report the metrics for these predictions, call evaluate_model(y_pred, y).
#%%
# Feature importances of the forest inside the best pipeline found by `model`.
features = X_test.columns
importances = model.best_estimator_.named_steps['randomforestclassifier'].feature_importances_
# Sorted ascending, so tail() keeps the five largest importances; they are
# drawn as a horizontal bar chart.
# NOTE(review): plt is not imported in the visible code -- presumably
# matplotlib.pyplot.
feat_imp = pd.Series(importances , index = features).sort_values()
feat_imp.tail().plot(kind= 'barh')
plt.xlabel("Gini Importance")
plt.ylabel("Feature")
plt.title("Feature Importance");

The Python code given above is related to Random Forest classification in the Data Science course.
Please interpret this code and prepare a report and a presentation based on the topics and code above.

Step by Step Solution

There are 3 Steps involved in it

Step: 1

blur-text-image

Get Instant Access to Expert-Tailored Solutions

See step-by-step solutions with expert insights and AI powered tools for academic success

Step: 2

blur-text-image

Step: 3

blur-text-image

Ace Your Homework with AI

Get the answers you need in no time with our AI-driven, step-by-step assistance

Get Started

Recommended Textbook for

Linked Data A Geographic Perspective

Authors: Glen Hart, Catherine Dolbear

1st Edition

1000218910, 9781000218916

More Books

Students also viewed these Databases questions