
Question

1 Approved Answer


import java.util.*;

public class DTMain {

    public static void main(String[] args) {
        // parameters: train_feature_fname, train_label_fname,
        //             test_feature_fname, test_label_fname,
        //             max_height (int), max_num_leaves (int)
        if (args.length < 6) {
            System.out.println("Usage: java DTMain train_feature_fname train_label_fname test_feature_fname test_label_fname max_height max_num_leaves");
            return;
        }
        try {
            String train_feature_fname = args[0];
            String train_label_fname = args[1];
            String test_feature_fname = args[2];
            String test_label_fname = args[3];
            int max_height = Integer.parseInt(args[4]);
            int max_num_leaves = Integer.parseInt(args[5]);

            DataWrapperClass train_data = new DataWrapperClass(train_feature_fname, train_label_fname);
            DataWrapperClass test_data = new DataWrapperClass(test_feature_fname, test_label_fname);

            DecisionTreeClass my_dt = new DecisionTreeClass(train_data, max_height, max_num_leaves);

            ArrayList<Integer> prediction = my_dt.predict(test_data);
            double final_accuracy = DataWrapperClass.accuracy(prediction, test_data.labels);
            System.out.println("Test Accuracy = " + final_accuracy);
        } catch (Exception e) {
            System.out.println("Error: " + e.getMessage());
            return;
        }
    }
}
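A typical invocation, once the FILL IN parts are completed, would look like the line below. The label file names and the limits 10 and 20 are only example values: the question lists the feature files but not the matching label files, so those names are assumptions.

java DTMain CAT_congress_train_fea.txt CAT_congress_train_label.txt CAT_congress_test_fea.txt CAT_congress_test_label.txt 10 20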

/////////////////////////////

import java.util.*;

public class DecisionTreeClass {

    private class DecisionTreeNode {
        public ArrayList<Integer> data_list;                    // list of data IDs
        public int opt_fea_type = -1;                           // 0 if continuous, 1 if categorical
        public int opt_fea_id = -1;                             // index of the optimal feature
        public double opt_fea_thd = Double.NEGATIVE_INFINITY;   // optimal splitting threshold for a continuous feature
        public int opt_improvement = Integer.MIN_VALUE;         // accuracy improvement if split on the optimal feature
        public boolean is_leaf = true;                          // is it a leaf
        public int majority_class = -1;                         // class prediction based on majority vote
        public int num_accurate = -1;                           // number of data predicted correctly by majority_class
        public DecisionTreeNode parent = null;                  // parent node
        public ArrayList<DecisionTreeNode> children;            // list of children after a split

        public DecisionTreeNode(ArrayList<Integer> d_list, int m_class, int n_acc) {
            data_list = new ArrayList<Integer>(d_list);
            majority_class = m_class;
            num_accurate = n_acc;
        }
    }

    public DataWrapperClass train_data;
    public int max_height;
    public int max_num_leaves;
    public int height;
    public int num_leaves;
    public DecisionTreeNode root;

    // constructor: build the decision tree using train_data, max_height and max_num_leaves
    public DecisionTreeClass(DataWrapperClass t_d, int m_h, int m_n_l) {
        train_data = t_d;
        max_height = m_h;
        max_num_leaves = m_n_l;
        // FILL IN
        //
        // create the root node, using all data IDs [0..N-1]
        // find the majority class, and how many data it predicts correctly
        // if the majority class is correct for all data,
        //     no improvement is possible: the optimal accuracy improvement = 0
        // else
        //     find the optimal feature to split on:
        //     for each feature
        //         if categorical
        //             split data_list into sub-lists, one per value of the feature
        //             for each sub-list
        //                 find its majority class
        //                 count the predictions that majority class gets right
        //             sum the counts over all sub-lists as the score
        //         if continuous
        //             sort the data by the continuous feature
        //             find the optimal threshold that splits data_list into two sub-lists
        //             for each sub-list
        //                 find its majority class
        //                 count the predictions that majority class gets right
        //             sum the counts over both sub-lists as the score
        //     pick the feature with the largest score (best total number of accurate predictions after splitting)
        //     optimal accuracy improvement = best total number of accurate predictions after splitting
        //                                    minus the number of accurate predictions of the current node
        //
        // put the root node, keyed by its optimal accuracy improvement, into a max-heap
        // while the heap is not empty
        //     extract the maximum entry (the leaf node with the largest optimal accuracy improvement)
        //     if the optimal accuracy improvement is zero (no improvement possible)
        //         break;
        //     else
        //         split the node: create one child per sub-list of the optimal feature's split
        //         for each child node
        //             find its optimal accuracy improvement (and optimal feature), as done for the root
        //             put the child into the max-heap
        //     if the number of leaves > max_num_leaves, break
        //     if the height > max_height, break
    }

    public ArrayList<Integer> predict(DataWrapperClass test_data) {
        // FILL IN
        // for each data point in test_data:
        //     start at the root
        //     at each internal node, descend to the child chosen by the splitting feature
        //     continue until a leaf is reached
        //     predict the majority_class of that leaf
        // return the list of predicted labels
        return new ArrayList<Integer>(); // placeholder so the skeleton compiles
    }
}
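The FILL IN sections above repeatedly need two building blocks: the majority class (with its accurate count) for a list of data IDs, and a max-heap of candidate leaves ordered by accuracy improvement. The sketch below is only an illustration of those two pieces, not the graded implementation; the class and method names (SplitSketch, majorityAndCount, Candidate) are made up for this example.

import java.util.*;

public class SplitSketch {

    // For the data IDs in data_list, return {majority_class, num_accurate}.
    static int[] majorityAndCount(ArrayList<Integer> labels, ArrayList<Integer> data_list) {
        HashMap<Integer, Integer> counts = new HashMap<>();
        for (int id : data_list) {
            counts.merge(labels.get(id), 1, Integer::sum);
        }
        int bestClass = -1, bestCount = -1;
        for (Map.Entry<Integer, Integer> e : counts.entrySet()) {
            if (e.getValue() > bestCount) {
                bestClass = e.getKey();
                bestCount = e.getValue();
            }
        }
        return new int[]{bestClass, bestCount};
    }

    // A candidate leaf waiting to be split, keyed by its optimal accuracy improvement.
    static class Candidate {
        int improvement;
        Candidate(int improvement) { this.improvement = improvement; }
    }

    public static void main(String[] args) {
        // Majority vote over a toy label list: IDs {0,1,2,3} with labels {1,1,0,1}.
        ArrayList<Integer> labels = new ArrayList<>(Arrays.asList(1, 1, 0, 1));
        ArrayList<Integer> ids = new ArrayList<>(Arrays.asList(0, 1, 2, 3));
        int[] mc = majorityAndCount(labels, ids);
        System.out.println("majority class = " + mc[0] + ", accurate = " + mc[1]); // 1, 3

        // Java's PriorityQueue is a min-heap; reversing the comparator gives the max-heap
        // described in the constructor comments (largest improvement extracted first).
        PriorityQueue<Candidate> heap =
            new PriorityQueue<>(Comparator.comparingInt((Candidate c) -> c.improvement).reversed());
        heap.add(new Candidate(2));
        heap.add(new Candidate(7));
        heap.add(new Candidate(0));
        System.out.println("best improvement = " + heap.poll().improvement); // 7
    }
}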

///////////////////////////////////////

import java.util.*;
import java.io.File;

public class DataWrapperClass {

    public int num_data;       // number of data points (N)
    public int num_features;   // number of features (D)
    public int num_classes;    // number of different classes (K)
    public int num_cont_fea;   // number of continuous features
    public int num_cat_fea;    // number of categorical features

    public ArrayList<ArrayList<Double>> continuous_features;    // only continuous features
    public ArrayList<ArrayList<Integer>> categorical_features;  // only categorical features
    public ArrayList<Integer> labels;                            // labels of all data

    // read features and labels from the input files
    public DataWrapperClass(String feature_fname, String label_fname) {
        // FILL IN
        // read the feature and label files
        // store features in continuous_features / categorical_features
        // store the labels
        // if the feature file name starts with 'CAT_', all features are categorical;
        // otherwise, all features are continuous
    }

    // static function: compare two label lists and report how many entries match
    public static int evaluate(ArrayList<Integer> l1, ArrayList<Integer> l2) {
        int len = l1.size();
        assert len == l2.size(); // lengths should be equal
        assert len > 0;          // lists should be non-empty
        int ct = 0;
        for (int i = 0; i < len; ++i) {
            if (l1.get(i).equals(l2.get(i))) ++ct;
        }
        return ct;
    }

    // static function: compare two label lists and report the accuracy (between 0 and 1)
    public static double accuracy(ArrayList<Integer> l1, ArrayList<Integer> l2) {
        int len = l1.size();
        assert len == l2.size(); // label lists should have equal length
        assert len > 0;          // lists should be non-empty
        double score = evaluate(l1, l2);
        score = score / len;     // normalize by dividing by the length
        return score;
    }
}
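For the constructor's FILL IN, one possible reading loop is sketched below. It assumes each line of the feature file holds one data point as whitespace-separated values (the actual layout of the provided files is not specified here), and it uses the 'CAT_' file-name prefix to choose between categorical (int) and continuous (double) parsing. The class name FeatureFileSketch is illustrative only.

import java.util.*;
import java.io.File;
import java.io.FileNotFoundException;

public class FeatureFileSketch {

    public static void main(String[] args) throws FileNotFoundException {
        String feature_fname = args[0];
        // 'CAT_' prefix means all features are categorical; otherwise all are continuous.
        boolean categorical = new File(feature_fname).getName().startsWith("CAT_");

        ArrayList<ArrayList<Double>> continuous_features = new ArrayList<>();
        ArrayList<ArrayList<Integer>> categorical_features = new ArrayList<>();

        Scanner fileScanner = new Scanner(new File(feature_fname));
        while (fileScanner.hasNextLine()) {
            // Assumption: one data point per line, values separated by whitespace.
            Scanner lineScanner = new Scanner(fileScanner.nextLine());
            ArrayList<Double> contRow = new ArrayList<>();
            ArrayList<Integer> catRow = new ArrayList<>();
            while (lineScanner.hasNext()) {
                if (categorical) catRow.add(Integer.parseInt(lineScanner.next()));
                else             contRow.add(Double.parseDouble(lineScanner.next()));
            }
            lineScanner.close();
            if (categorical && !catRow.isEmpty()) categorical_features.add(catRow);
            if (!categorical && !contRow.isEmpty()) continuous_features.add(contRow);
        }
        fileScanner.close();

        int num_data = categorical ? categorical_features.size() : continuous_features.size();
        System.out.println("read " + num_data + " rows from " + feature_fname);
    }
}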

//////////////////////

Here is the data we need:

CAT_congress_test_fea.txt

0 1 1 2 2 1 1 1 1 0 1 1 2 0 0 0 2 0 1 2 0 0 0 0 0 0 2 1 0 0 0 0 0 2 0 0 0 0 0 0 0 0 1 0 0 0 0 2 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 2 0 0 0 0 2 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 2 1 1 2 2 0 1 1 1 0 1 2 2 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 2 1 0 0 0 2 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 2 0 1 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 2 1 0 0 0 2

/////////////////////////////

CAT_congress_train_fea.txt

2 1 1 2 2 0 1 1 1 0 2 1 2 1 1 0 2 2 1 2 2 1 1 1 1 1 1 1 2 1 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 2 2 1 1 2 2 0 1 1 1 1 1 1 2 0 0 1 2 1 1 2 2 1 1 1 1 0 1 1 2 1 0 0 0 1 0 0 0 0 0 0 0 0 1 0 2 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 2 0 1 0 0 0 0 0 0 0 0 2 0 0 0 0 1 0 1 0 0 0 0 0 0 0 1 1 0 0 0 0 0 2 1 1 2 2 0 1 1 0 1 1 0 0 1 0 0 0 2 0 0 0 0 0 0 0 0 1 0 0 0 0 0

/////////////////////////////

breast_cancer_test_fea.txt

12.47000 18.60000 81.09000 481.90000 0.09965 0.10580 0.08005 0.03821 0.19250 0.06373 0.39610 1.04400 2.49700 30.29000 0.00695 0.01911 0.02701 0.01037 0.01782 0.00359 14.97000 24.64000 96.05000 677.90000 0.14260 0.23780 0.26710 0.10150 0.30140 0.08750 18.94000 21.31000 123.60000 1130.00000 0.09009 0.10290 0.10800 0.07951 0.15820 0.05461 0.78880 0.79750 5.48600 96.05000 0.00444 0.01652 0.02269 0.01370 0.01386 0.00170 24.86000 26.58000 165.90000 1866.00000 0.11930 0.23360 0.26870 0.17890 0.25510 0.06589 15.46000 19.48000 101.70000 748.90000 0.10920 0.12230 0.14660 0.08087 0.19310 0.05796 0.47430 0.78590 3.09400 48.31000 0.00624 0.01484 0.02813 0.01093 0.01397 0.00246 19.26000 26.00000 124.90000 1156.00000 0.15460 0.23940 0.37910 0.15140 0.28370 0.08019
