Answered step by step
Verified Expert Solution
Link Copied!

Question

1 Approved Answer

import numpy as np; from collections import Counter; from sklearn import datasets, model_selection  # No other libraries will be imported; load the

import numpy as np
from collections import Counter
from sklearn import datasets, model_selection
# No other libraries will be imported
# Load the Iris Dataset, which contains 150 samples.
# Each sample has 4 features.
# The dataset contains 3 classes of 50 instances each, where each class refers to a type of iris plant.
iris = datasets.load_iris()
X = np.array(iris.data)  # features, numeric attributes. [Sepal length, Sepal Width, Petal length, Petal width]
Y = np.array(iris.target)  # labels: class-0, class-1, class-2
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(X, Y, test_size=0.25, random_state=0)
print("Train Shape:", X_train.shape)
# bug fix: this line previously printed the label "Train Shape:" for the TEST split
print("Test Shape:", X_test.shape)
3. Calculate the Information Gain for each (numeric) attribute, and show the feature that should be used first when building a decision tree.
step-1: find the best cutpoint for each attribute (the value at which to split the data).
step-2: calculate the information gain for each attribute (this decides the order of attributes when building the decision tree).
#-------------------- Some helper functions ------------------------------
def entropy(probabilities: list) -> float:
    """Return the Shannon entropy H(X), in bits, of a probability distribution.

    Terms with p == 0 contribute nothing (0 * log 0 is taken as 0).
    """
    total = 0.0
    for p in probabilities:
        if p > 0:
            total -= p * np.log2(p)
    return total
def class_probabilities(labels: list) -> list:
    """Return the empirical probability P(Y=c) for each class present in *labels*.

    Order follows Counter insertion order (first occurrence of each class).
    An empty label list yields an empty result.
    """
    n = len(labels)
    counts = Counter(labels)
    return [count / n for count in counts.values()]
def data_entropy(labels: list) -> float:
    """Return the entropy H(Y), in bits, of a list of class labels.

    Equivalent to entropy(class_probabilities(labels)): class frequencies
    from a Counter are normalized to probabilities and fed to the
    Shannon-entropy sum (counts are always positive, so no term is skipped).
    """
    n = len(labels)
    frequencies = (count / n for count in Counter(labels).values())
    return sum(-p * np.log2(p) for p in frequencies if p > 0)
def split_data(data: np.array, feature_idx: int, feature_val: float) -> tuple:
    """Split *data* into two sub-groups on one attribute.

    Rows with data[:, feature_idx] < feature_val go to group1;
    all remaining rows (>= feature_val) go to group2.
    Returns (group1, group2).
    """
    below = data[:, feature_idx] < feature_val
    return data[below], data[~below]
def partition_entropy(g1_labels: list, g2_labels: list) -> float:
    """Return the weighted entropy H(Y | split) of a two-way partition.

    Each group's entropy is weighted by the fraction of samples it holds.
    """
    n1, n2 = len(g1_labels), len(g2_labels)
    n = n1 + n2
    weighted = (n1 / n) * data_entropy(g1_labels)
    weighted += (n2 / n) * data_entropy(g2_labels)
    return weighted
#-----------------------------------------------------------------------------------------
#---------------------------- Examples to use the Helper functions -----------------------
# H(Y) for the train and test label sets:
print(data_entropy(Y_train))
print(data_entropy(Y_test))
## attach labels as a final column so every split carries them along:
train_data = np.column_stack((X_train, Y_train))  # same as concatenating a reshaped Y_train
print(train_data.shape)
# split the data into two subgroups on feature 1 at threshold 3
group1, group2 = split_data(train_data, feature_idx=1, feature_val=3)
print(group1.shape)
print(group2.shape)
# weighted entropy of this split (labels live in the last column):
print(partition_entropy(group1[:, -1], group2[:, -1]))
#-----------------------------------------------------------------------------------------
#-------------------------------- Your implementation ------------------------------------
# For each attribute: scan candidate cutpoints (midpoints between consecutive
# sorted unique feature values), keep the cut with the highest information gain
#   IG(feature, cut) = H(Y_train) - H(Y_train | split at cut),
# then report the best cut and gain per attribute.
base_entropy = data_entropy(Y_train)  # H(Y) before any split
n_features = X_train.shape[1]
best_cutpoints = [None] * n_features  # best threshold found for each attribute
best_gains = [0.0] * n_features       # information gain at that threshold
for feature_idx in range(n_features):
    values = np.unique(X_train[:, feature_idx])
    # midpoints between consecutive distinct values cover every possible split
    candidates = (values[:-1] + values[1:]) / 2
    for feature_val in candidates:
        g1, g2 = split_data(train_data, feature_idx, feature_val)
        if len(g1) == 0 or len(g2) == 0:
            continue  # degenerate split: one side empty, no information
        gain = base_entropy - partition_entropy(g1[:, -1], g2[:, -1])
        if gain > best_gains[feature_idx]:
            best_gains[feature_idx] = gain
            best_cutpoints[feature_idx] = feature_val
#----------------------------------- Printing --------------------------------------------
# print the calculated cutpoint [feature_val] and information gain for each attribute.
for feature_idx in range(n_features):
    print(f"feature {feature_idx}: best cutpoint = {best_cutpoints[feature_idx]}, "
          f"information gain = {best_gains[feature_idx]:.4f}")
# print the feature that should be used first when building the decision tree.
best_feature = int(np.argmax(best_gains))
print(f"feature {best_feature} should be used first when building the decision tree")
Please help me complete this code

Step by Step Solution

There are 3 Steps involved in it

Step: 1

blur-text-image

Get Instant Access to Expert-Tailored Solutions

See step-by-step solutions with expert insights and AI powered tools for academic success

Step: 2

blur-text-image

Step: 3

blur-text-image

Ace Your Homework with AI

Get the answers you need in no time with our AI-driven, step-by-step assistance

Get Started

Recommended Textbook for

Concepts of Database Management

Authors: Philip J. Pratt, Mary Z. Last

8th edition

1285427106, 978-1285427102

More Books

Students also viewed these Databases questions