Page 74 - Data Science Algorithms in a Week
P. 74

Decision Trees


                                    str(child_data) + "\n")
                        printfv(
                            2, verbose, "Now, given the partitions, let us form the " +
                                        "branches and the child nodes.\n")
                    for child_group, child_data in data_groups.items():
                        child = TreeNode(heading[selected_col], child_group)
                        printfv(2, verbose, "\nWe add a child node " + child.name() +
                                " to the node " + node.name() + ". " +
                                "This branch classifies %d feature(s): " +
                                str(child_data) + "\n", len(child_data))
                        add_children_to_node(verbose, child, heading, child_data, list(
                            available_columns), enquired_column, m)
                        node.add_child(child)
                    printfv(2, verbose,
                            "\nNow, we have added all the children nodes for the " +
                            "node " + node.name() + ".\n")

                # Selects the column/attribute with the highest information
                # gain from a random sample of at most m available columns.
                def select_col(verbose, heading, complete_data, available_columns,
                               enquired_column, m):
                    # Consider only a subset of the available columns of size m.
                    printfv(2, verbose,
                            "The available variables that we have still left are " +
                            str(numbers_to_strings(available_columns, heading)) + ". ")
                    if len(available_columns) < m:
                        printfv(
                            2, verbose, "As there are fewer of them than the " +
                                        "parameter m=%d, we consider all of them. ", m)
                        sample_columns = available_columns
                    else:
                        sample_columns = random.sample(available_columns, m)
                        printfv(2, verbose,
                                "We choose a subset of them of size m to be " +
                                str(numbers_to_strings(available_columns, heading)) +
                                ".")

                    selected_col = -1
                    selected_col_information_gain = -1
                    for col in sample_columns:
                        current_information_gain = col_information_gain(
                            complete_data, col, enquired_column)
                        # print len(complete_data),col,current_information_gain
                        if current_information_gain > selected_col_information_gain:
                            selected_col = col
                            selected_col_information_gain = current_information_gain
                    printfv(2, verbose,
                            "Out of these variables, the variable with " +


                                                     [ 62 ]
   69   70   71   72   73   74   75   76   77   78   79