


                            "the highest information gain is the variable " +
                            heading[selected_col] +
                            ". Thus we will branch the node further on this " +
                            "variable. " +
                            "We also remove this variable from the list of the " +
                            "available variables for the children of the current node. ")
                    return selected_col

                # Calculates the information gain when partitioning complete_data
                # according to the attribute at the column col and classifying by the
                # attribute at enquired_column.
                def col_information_gain(complete_data, col, enquired_column):
                    data_groups = split_data_by_col(complete_data, col)
                    information_gain = entropy(complete_data, enquired_column)
                    for _, data_group in data_groups.items():
                        information_gain -= (float(len(data_group)) / len(complete_data)
                                             ) * entropy(data_group, enquired_column)
                    return information_gain

                # Calculates the entropy of the data classified by the attribute
                # at the enquired_column.
                def entropy(data, enquired_column):
                    value_counts = {}
                    for data_item in data:
                        if value_counts.get(data_item[enquired_column]) is None:
                            value_counts[data_item[enquired_column]] = 0
                        value_counts[data_item[enquired_column]] += 1
                    entropy = 0
                    for _, count in value_counts.items():
                        probability = float(count) / len(data)
                        entropy -= probability * math.log(probability, 2)
                    return entropy
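
            The entropy computed here is the Shannon entropy,
            entropy = -sum(p_i * log2(p_i)), where p_i is the fraction of the
            data items carrying the i-th value of the enquired column. An
            evenly split two-class set therefore has entropy 1 bit, while a
            pure set has entropy 0. The following is a minimal sketch
            exercising the function above on these two boundary cases (it
            assumes import math appears at the top of the program, as in the
            full source):

                import math

                # An evenly split column has the maximal entropy of 1 bit:
                print(entropy([['Yes'], ['No']], 0))   # 1.0
                # A pure column carries no uncertainty:
                print(entropy([['Yes'], ['Yes']], 0))  # 0.0
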
            Program input:

            We input the data from the swim preference example into the program to construct a
            decision tree:

                # source_code/3/swim.csv
                swimming_suit,water_temperature,swim
                None,Cold,No
                None,Warm,No
                Small,Cold,No
                Small,Warm,No
                Good,Cold,No
                Good,Warm,Yes
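
            Tracing the functions above over this table shows why the program
            branches on swimming_suit first. The swim column contains five No
            values and one Yes, so its entropy is
            -(5/6)*log2(5/6) - (1/6)*log2(1/6), approximately 0.650 bits. The
            following sketch repeats the computation in code, assuming the two
            functions above and the program's split_data_by_col are in scope;
            the row list simply restates swim.csv:

                data = [['None', 'Cold', 'No'], ['None', 'Warm', 'No'],
                        ['Small', 'Cold', 'No'], ['Small', 'Warm', 'No'],
                        ['Good', 'Cold', 'No'], ['Good', 'Warm', 'Yes']]
                print(entropy(data, 2))                  # ~0.650
                print(col_information_gain(data, 0, 2))  # swimming_suit: ~0.317
                print(col_information_gain(data, 1, 2))  # water_temperature: ~0.191

            Since 0.317 > 0.191, the swimming_suit variable yields the higher
            information gain and becomes the first branching variable of the
            decision tree.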



