Page 74 - Data Science Algorithms in a Week

P. 74

Decision Trees

str(child_data) + "\n")
printfv(
2, verbose, "Now, given the partitions, let us form the " +
"branches and the child nodes.\n")
for child_group, child_data in data_groups.items():
child = TreeNode(heading[selected_col], child_group)
printfv(2, verbose, "\nWe add a child node " + child.name() +
" to the node " + node.name() + ". " +
"This branch classifies %d feature(s): " +
str(child_data) + "\n", len(child_data))
add_children_to_node(verbose, child, heading, child_data, list(
available_columns), enquired_column, m)
node.add_child(child)
printfv(2, verbose,
"\nNow, we have added all the children nodes for the " +
"node " + node.name() + ".\n")

# Selects the column/attribute with the highest information gain from a
# random subset of m available columns (or from all of them if fewer than m).
def select_col(verbose, heading, complete_data, available_columns,
enquired_column, m):
# Consider only a subset of the available columns of size m.
printfv(2, verbose,
"The available variables that we have still left are " +
str(numbers_to_strings(available_columns, heading)) + ". ")
if len(available_columns) < m:
printfv(
2, verbose, "As there are fewer of them than the " +
"parameter m=%d, we consider all of them. ", m)
sample_columns = available_columns
else:
sample_columns = random.sample(available_columns, m)
printfv(2, verbose,
"We choose a subset of them of size m to be " +
str(numbers_to_strings(available_columns, heading)) +
".")

selected_col = -1
selected_col_information_gain = -1
for col in sample_columns:
current_information_gain = col_information_gain(
complete_data, col, enquired_column)
# print len(complete_data),col,current_information_gain
if current_information_gain > selected_col_information_gain:
selected_col = col
selected_col_information_gain = current_information_gain
printfv(2, verbose,
"Out of these variables, the variable with " +

[ 62 ]

69 70 71 72 73 74 75 76 77 78 79