Question
The driver program DTMain takes six command-line parameters: train_feature_fname, train_label_fname, test_feature_fname, test_label_fname, max_height (int), and max_num_leaves (int).
import java.util.*;
public class DTMain {
public static void main(String[] args) {
// TODO Auto-generated method stub
// parameters: train_feature_fname, train_label_fname,
// test_feature_fname, test_label_fname,
// max_height(int), max_num_leaves(int)
if(args.length < 6){
System.out.println("java DTMain train_feature_fname train_label_fname test_feature_fname test_label_fname max_height max_num_leaves");
return;
}
try{
String train_feature_fname = args[0];
String train_label_fname = args[1];
String test_feature_fname = args[2];
String test_label_fname = args[3];
int max_height = Integer.parseInt(args[4]);
int max_num_leaves = Integer.parseInt(args[5]);
DataWrapperClass train_data = new DataWrapperClass(train_feature_fname, train_label_fname);
DataWrapperClass test_data = new DataWrapperClass(test_feature_fname, test_label_fname);
DecisionTreeClass my_dt = new DecisionTreeClass(train_data, max_height, max_num_leaves);
ArrayList
double final_accuracy = DataWrapperClass.accuracy(prediction, test_data.labels);
System.out.println("Test Accuracy = " + final_accuracy);
} catch (Exception e) {
System.out.println("NULL: Something is wrong");
return;
}
}
}
/////////////////////////////
import java.util.*;
public class DecisionTreeClass {
private class DecisionTreeNode{
public ArrayList
public int opt_fea_type = -1; // 0 if continuous, 1 if categorical
public int opt_fea_id = -1; // the index of the optimal feature
public double opt_fea_thd = Double.NEGATIVE_INFINITY; // the optimal splitting threshold
// for continuous feature
public int opt_improvement = Integer.MIN_VALUE; // the improvement if split based on the optimal feature
public boolean is_leaf = true; // is it a leaf
public int majority_class = -1; // class prediction based on majority vote
public int num_accurate = -1; // number of accurate data using majority_class
public DecisionTreeNode parent = null; // parent node
public ArrayList
public DecisionTreeNode(ArrayList
data_list = new ArrayList
majority_class = m_class;
num_accurate = n_acc;
}
}
public DataWrapperClass train_data;
public int max_height;
public int max_num_leaves;
public int height;
public int num_leaves;
public DecisionTreeNode root;
// constructor, build the decision tree using train_data, max_height and max_num_leaves
public DecisionTreeClass(DataWrapperClass t_d, int m_h, int m_n_l){
train_data = t_d;
max_height = m_h;
max_num_leaves = m_n_l;
// FILL IN
//
// create the root node, use all data_ids [0..N-1]
// find the majority class, also how many accurate using the majority class
// if the majority class is correct for all data,
// no improvement possible
// the optimal accuracy improvement = 0
// else
// find the optimal feature to split
// for each feature
// if categorical
// split the data_list into sub-lists using different values of the feature
// for each sub-list
// find the majority class
// compute the number of accurate prediction using this majority class
// sum up number of accurate predictions for all sub-lists as the score
// if continuous
// sort the data based on the continuous feature
// find the optimal threshold to split the data_list into two sub-lists
// for each of sub-list
// find the majority class
// compute the number of accurate prediction using this majority class
// sum up number of accurate predictions for all sub-lists as the score
// find the feature with the largest score (best total num of accurate prediction after splitting)
// optimal accuracy improvement = the difference between the best total num of accurate prediction after splitting
// and the number of accurate prediction using the majority class of the current node
// put the root node and the optimal accuracy improvement into a max-heap
// while the heap is not empty
// extract the maximum entry (the leaf node with the maximal optimal accuracy improvement) from the heap
// if the optimal accuracy improvement is zero (no improvement possible)
// break;
// else
// split the node
// create children based on the optimal feature and split (each sub-list creates one child)
// for each child node
// find its optimal accuracy improvement (and the optimal feature) (as you do for the root)
// put the node into the max-heap
// if the number of leaves > max_num_leaves
// break;
// if the height > max_height
// break;
}
public ArrayList
// for each data in the test_data
// starting from the root,
// at each node, go to the right child based on the splitting feature
// continue until a leaf is reached
// assign the label to the data based on the majority-class of the leaf node
// return the list of predicted label
}
}
///////////////////////////////////////
import java.util.*;
import java.io.File;
public class DataWrapperClass {
public int num_data; // number of data (N)
public int num_features; // number of features (D)
public int num_classes; // number of different classes (K)
public int num_cont_fea; // number of continuous features
public int num_cat_fea; // number of categorical features
public ArrayList
public ArrayList
public ArrayList
// read features and labels from input files
public DataWrapperClass(String feature_fname, String label_fname){
// FILL IN
// read feature and label file
// store feature in continuous_/categorical_features,
// store labels
// if file name starts with 'CAT_', all features are categorical
// otherwise, all features are continuous
}
// static function, compare two label lists, report how many are correct
public static int evaluate(ArrayList
int len = l1.size();
assert len == l2.size(); // length should be equal
assert len > 0; // length should be bigger than zero
int ct = 0;
for(int i = 0; i < len; ++i){
if(l1.get(i).equals(l2.get(i))) ++ct;
}
return ct;
}
// static function, compare two label lists, report score (between 0 and 1)
public static double accuracy(ArrayList
int len = l1.size();
assert len == l2.size(); // label lists should have equal length
assert len > 0; // lists should be non-empty
double score = evaluate(l1,l2);
score = score / len; // normalize by divided by the length
return score;
}
}
//////////////////////
Here is the data we need:
CAT_congress_test_fea.txt
0 1 1 2 2 1 1 1 1 0 1 1 2 0 0 0 2 0 1 2 0 0 0 0 0 0 2 1 0 0 0 0 0 2 0 0 0 0 0 0 0 0 1 0 0 0 0 2 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 2 0 0 0 0 2 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 2 1 1 2 2 0 1 1 1 0 1 2 2 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 2 1 0 0 0 2 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 2 0 1 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 2 1 0 0 0 2
/////////////////////////////
CAT_congress_train_fea.txt
2 1 1 2 2 0 1 1 1 0 2 1 2 1 1 0 2 2 1 2 2 1 1 1 1 1 1 1 2 1 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 2 2 1 1 2 2 0 1 1 1 1 1 1 2 0 0 1 2 1 1 2 2 1 1 1 1 0 1 1 2 1 0 0 0 1 0 0 0 0 0 0 0 0 1 0 2 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 2 0 1 0 0 0 0 0 0 0 0 2 0 0 0 0 1 0 1 0 0 0 0 0 0 0 1 1 0 0 0 0 0 2 1 1 2 2 0 1 1 0 1 1 0 0 1 0 0 0 2 0 0 0 0 0 0 0 0 1 0 0 0 0 0
/////////////////////////////
breast_cancer_test_fea.txt
12.47000 18.60000 81.09000 481.90000 0.09965 0.10580 0.08005 0.03821 0.19250 0.06373 0.39610 1.04400 2.49700 30.29000 0.00695 0.01911 0.02701 0.01037 0.01782 0.00359 14.97000 24.64000 96.05000 677.90000 0.14260 0.23780 0.26710 0.10150 0.30140 0.08750 18.94000 21.31000 123.60000 1130.00000 0.09009 0.10290 0.10800 0.07951 0.15820 0.05461 0.78880 0.79750 5.48600 96.05000 0.00444 0.01652 0.02269 0.01370 0.01386 0.00170 24.86000 26.58000 165.90000 1866.00000 0.11930 0.23360 0.26870 0.17890 0.25510 0.06589 15.46000 19.48000 101.70000 748.90000 0.10920 0.12230 0.14660 0.08087 0.19310 0.05796 0.47430 0.78590 3.09400 48.31000 0.00624 0.01484 0.02813 0.01093 0.01397 0.00246 19.26000 26.00000 124.90000 1156.00000 0.15460 0.23940 0.37910 0.15140 0.28370 0.08019
Step by Step Solution
There are 3 steps involved in it.
Step: 1
Get Instant Access to Expert-Tailored Solutions
See step-by-step solutions with expert insights and AI powered tools for academic success
Step: 2
Step: 3
Ace Your Homework with AI
Get the answers you need in no time with our AI-driven, step-by-step assistance
Get Started