Answered step by step
Verified Expert Solution
Link Copied!

Question

1 Approved Answer

#Data Visualization
import seaborn as sns
import matplotlib.pyplot as plt
sns.barplot(x="class", y=data["class"].index, ...)

#Data Visualization
import seaborn as sns
import matplotlib.pyplot as plt
# NOTE(review): x="class" with y set to the frame's row index is unusual —
# a per-class count (sns.countplot) is probably what was intended; also
# `data` and `mushroom_data` look like two names for the same frame — confirm.
sns.barplot(x="class", y=data["class"].index, palette='mako', data=mushroom_data)
#The number of poisonous mushrooms is almost twice the number of normal mushrooms. There is an imbalanced-data problem.
#We will be using Matplotlib pyplot and Seaborn to plot our data.
#%%
from sklearn import preprocessing
#Label encoding is used to convert categorical features to numerical values.
def label_encode_fit(mushroom_data, columns):
    """Label-encode the given columns of a DataFrame.

    Parameters
    ----------
    mushroom_data : pandas.DataFrame whose listed columns are categorical.
    columns : iterable of column names to encode.

    Returns
    -------
    (result, encoders) : the encoded copy of the frame, and a dict mapping
    each column name to its fitted LabelEncoder so the integer codes can be
    inverted or reused on new data later.
    """
    # Work on a copy so the caller's frame is left untouched.
    result = mushroom_data.copy()
    encoders = {}
    for column in columns:
        encoder = preprocessing.LabelEncoder()
        # fit_transform maps each distinct category to an integer 0..k-1.
        result[column] = encoder.fit_transform(result[column])
        encoders[column] = encoder
    return result, encoders
#%%
# Encode every column of the raw frame; keep the fitted encoders so the
# integer labels can be mapped back to the original categories later.
data1, encoders1= label_encode_fit(data,data.columns)
# Quick visual check of the encoded values.
data1.head(10)
#%%
def correlation_map(mushroom_data, method):
    """Plot a correlation heatmap of the frame, with columns ordered by
    their correlation with the 'class' target (strongest first).

    Parameters
    ----------
    mushroom_data : pandas.DataFrame of numeric (label-encoded) columns,
        including a 'class' column.
    method : correlation method forwarded to DataFrame.corr
        (e.g. 'pearson', 'spearman', 'kendall').
    """
    corr = mushroom_data.corr(method)
    # Order columns by their correlation with the target so the strongest
    # predictors appear first along both heatmap axes.
    ix = corr.sort_values('class', ascending=False).index
    df_sorted_by_correlation = mushroom_data.loc[:, ix]
    corr = df_sorted_by_correlation.corr(method)
    plt.subplots(figsize=(18, 14))
    with sns.axes_style("white"):
        # Display a correlation heatmap with the coefficients annotated.
        ax = sns.heatmap(corr, annot=True)
    plt.show()
#%%
# Spearman suits the integer-encoded (ordinal-ish) features better than Pearson.
correlation_map(data1, method="spearman")
#Gill_size has the highest correlation with class. It should be included in the model.
#There are some highly correlated variable pairs, such as gill-color & ring-type, gill-color & bruises, and bruises & stalk-surface-below-ring. These highly correlated variables should be discarded from the model to obtain more accurate results.
#%%
y = data1[['class']] # contains only "class", the target variable
X = data1.iloc[:,1:] # contains the independent variables (everything after column 0)
#%%
from sklearn.feature_selection import SelectKBest
import numpy as np
def SelectKBestCustomized(mushroom_data, k, score_func, target="class"):
    """Return the names of the k highest-scoring features.

    Parameters
    ----------
    mushroom_data : pandas.DataFrame containing the target column.
    k : number of features to keep.
    score_func : scoring function forwarded to sklearn's SelectKBest
        (e.g. mutual_info_classif).
    target : name of the target column (dropped from the feature matrix).

    Returns
    -------
    list of the selected column names, in their original column order.
    """
    X = mushroom_data.drop(columns=target)
    y = mushroom_data[target]
    np.random.seed(123)  # make mutual-information scores reproducible
    fs = SelectKBest(score_func=score_func, k=k)
    fs.fit(X, y)
    # Boolean mask aligned with X.columns: True for the selected features.
    mask = fs.get_support()
    # `keep` instead of the original `bool`, which shadowed the builtin.
    selected_features = [feature for keep, feature in zip(mask, X.columns) if keep]
    return selected_features
#%%
from sklearn.feature_selection import mutual_info_classif
# Mutual information between each feature and the target; higher scores
# mean the feature carries more information about 'class'.
mutual_info_classif(X, y, random_state=123)
#%%
# Keep the 9 features with the highest mutual-information scores.
mutual_info_selection = SelectKBestCustomized(data1,9, mutual_info_classif)
#%%
# Display the selected feature names.
mutual_info_selection
#%%
# Hand-written list of the 9 features chosen above.
# NOTE(review): this hard-coded list should match mutual_info_selection —
# re-check the two agree if the data or random seed changes.
X_new = X[['odor','gill-size',
'gill-color',
'stalk-surface-above-ring',
'stalk-surface-below-ring',
'stalk-color-above-ring',
'stalk-color-below-ring',
'ring-type',
'spore-print-color']]
#%%
# Same selected features plus the target, kept together for plotting below.
data_selected_features = data1[['odor',
'gill-size',
'gill-color',
'stalk-surface-above-ring',
'stalk-surface-below-ring',
'stalk-color-above-ring',
'stalk-color-below-ring',
'ring-type',
'spore-print-color',
'class']]
#%%
# One barplot per selected feature (and the target), laid out on a 5x3 grid.
a = 5  # number of rows in the subplot grid
b = 3  # number of columns in the subplot grid
c = 1  # subplot counter (1-based, as plt.subplot expects)
fig = plt.figure(figsize=(14, 22))
for i in data_selected_features:
    plt.subplot(a, b, c)
    plt.xlabel(i)
    # NOTE(review): y is the frame's row index rather than a count or
    # aggregate — a countplot per feature is probably what was intended;
    # confirm against the data before relying on these charts.
    sns.barplot(x=i, y=data_selected_features[i].index, palette='Set3_r', hue="class", data=data_selected_features)
    c = c + 1
plt.show()

The Python code given above is related to random forest classification in the data science course.
Please interpret this code and prepare a report according to the topics covered and the code shown.

Step by Step Solution

There are 3 Steps involved in it

Step: 1

blur-text-image

Get Instant Access to Expert-Tailored Solutions

See step-by-step solutions with expert insights and AI powered tools for academic success

Step: 2

blur-text-image

Step: 3

blur-text-image

Ace Your Homework with AI

Get the answers you need in no time with our AI-driven, step-by-step assistance

Get Started

Recommended Textbook for

Graph Database Modeling With Neo4j

Authors: Ajit Singh

2nd Edition

B0BDWT2XLR, 979-8351798783

More Books

Students also viewed these Databases questions