Page 72 - Data Science Algorithms in a Week
P. 72

Decision Trees


                    def is_root(self):
                        """Return True when neither ``var`` nor ``val`` is set on this node."""
                        # A node carrying a variable cannot be the root; ``val`` is
                        # only inspected when ``var`` is None.
                        if self.var is not None:
                            return False
                        return self.val is None

                    def is_leaf(self):
                        """Return True when this node has no children."""
                        # An empty container is falsy, so no explicit length
                        # comparison is needed.
                        return not self.children

                    def name(self):
                        """Return a bracketed label for this node.

                        The root is labelled "[root]"; any other node is labelled
                        "[<var>=<val>]" using its ``var`` and ``val`` attributes.
                        """
                        if self.is_root():
                            return "[root]"
                        # format() stringifies both parts, so non-string attribute
                        # values (e.g. numbers) do not raise TypeError the way "+"
                        # concatenation would.
                        return "[{0}={1}]".format(self.var, self.val)

                # Builds a decision tree from a table of fully-known samples.
                #   verbose         - passed through unchanged to construct_general_tree.
                #   heading         - the heading of the data table, i.e. the names of
                #                     the attributes.
                #   complete_data   - data samples with a known value for every
                #                     attribute.
                #   enquired_column - zero-based index of the column which holds the
                #                     classifying attribute.
                # len(heading) is passed as the last argument (m) of
                # construct_general_tree.
                # NOTE(review): the name is spelled "constuct"; left as-is because
                # callers outside this view may rely on it.
                def constuct_decision_tree(verbose, heading, complete_data,
                                           enquired_column):
                    column_limit = len(heading)
                    return construct_general_tree(verbose, heading, complete_data,
                                                  enquired_column, column_limit)
                # Creates a new TreeNode as the root, has add_children_to_node populate
                # it, and returns it.
                #   m - the maximum number of classifying variables to consider at each
                #       node; only needed for a random forest.
                # Every column index of heading except enquired_column is passed on as
                # an available column.
                def construct_general_tree(verbose, heading, complete_data,
                                           enquired_column, m):
                    # Candidate columns: everything but the classifying column.
                    candidate_columns = [index for index in range(len(heading))
                                         if index != enquired_column]
                    root = TreeNode()
                    printfv(2, verbose, "We start the construction with the root node" +
                                        " to create the first node of the tree.\n")
                    add_children_to_node(verbose, root, heading, complete_data,
                                         candidate_columns, enquired_column, m)
                    return root

                # Groups the data samples by the value they hold in column col.
                # Returns a dict that maps each distinct value to the list of samples
                # carrying it; samples keep the order in which they appear in data.
                def split_data_by_col(data, col):
                    groups_by_value = {}
                    for sample in data:
                        # setdefault creates the empty group the first time a value
                        # is seen and returns the existing group afterwards.
                        groups_by_value.setdefault(sample[col], []).append(sample)
                    return groups_by_value

                # Adds a leaf node to node.
                def add_leaf(verbose, node, heading, complete_data, enquired_column):


                                                     [ 60 ]
   67   68   69   70   71   72   73   74   75   76   77