Page 97 - Data Science Algorithms in a Week
P. 97

Random Forest


                    m = int(min(M, math.ceil(2 * math.sqrt(M))))
                    printfv(2, verbose, "We are given M=" + str(M) +
                            " variables according to which a feature can be " +
                            "classified. ")
                    printfv(3, verbose, "In random forest algorithm we usually do " +
                            "not use all " + str(M) + " variables to form tree " +
                            "branches at each node. ")
                    printfv(3, verbose, "We use only m variables out of M. ")
                    printfv(3, verbose, "So we choose m such that m is less than or " +
                            "equal to M. ")
                    printfv(3, verbose, "The greater m is, a stronger classifier an " +
                            "individual tree constructed is. However, it is more " +
                            "susceptible to a bias as more of the data is considered. " +
                            "Since we in the end use multiple trees, even if each may " +
                            "be a weak classifier, their combined classification " +
                            "accuracy is strong. Therefore as we want to reduce a " +
                            "bias in a random forest, we may want to consider to " +
                            "choose a parameter m to be slightly less than M.\n")
                    printfv(2, verbose, "Thus we choose the maximum number of the " +
                            "variables considered at the node to be " +
                            "m=min(M,math.ceil(2*math.sqrt(M)))" +
                            "=min(M,math.ceil(2*math.sqrt(%d)))=%d.\n", M, m)
                    return m
                #Classification
                def display_classification(verbose, random_forest, heading,
                                           enquired_column, incomplete_data):
                    """Print the forest's vote-based classification for each
                    feature in incomplete_data, or a notice when empty."""
                    # Guard clause: nothing to classify.
                    if not incomplete_data:
                        printfv(0, verbose, "No data to classify.\n")
                        return
                    for feature in incomplete_data:
                        printfv(0, verbose, "\nFeature: " +
                                str(feature) + "\n")
                        # Delegate the per-feature vote display to the helper.
                        display_classification_for_feature(
                            verbose, random_forest, heading,
                            enquired_column, feature)

                def display_classification_for_feature(verbose, random_forest, heading,
                                                       enquired_column, feature):
                    classification = {}
                    for i in range(0, len(random_forest)):
                        group = decision_tree.classify_by_tree(
                            random_forest[i], heading, enquired_column, feature)
                        common.dic_inc(classification, group)
                        printfv(0, verbose, "Tree " + str(i) +
                                " votes for the class: " + str(group) + "\n")
                    printfv(0, verbose, "The class with the maximum number of votes " +
                            "is '" + str(common.dic_key_max_count(classification)) +


                                                     [ 85 ]
   92   93   94   95   96   97   98   99   100   101   102