Page 97 - Data Science Algorithms in a Week
P. 97

Random Forest


                    m = int(min(M, math.ceil(2 * math.sqrt(M))))
                    printfv(2, verbose, "We are given M=" + str(M) +
                            " variables according to which a feature can be " +
                            "classified. ")
                    printfv(3, verbose, "In random forest algorithm we usually do " +
                            "not use all " + str(M) + " variables to form tree " +
                            "branches at each node. ")
                    printfv(3, verbose, "We use only m variables out of M. ")
                    printfv(3, verbose, "So we choose m such that m is less than or " +
                            "equal to M. ")
                    printfv(3, verbose, "The greater m is, a stronger classifier an " +
                            "individual tree constructed is. However, it is more " +
                            "susceptible to a bias as more of the data is considered. " +
                            "Since we in the end use multiple trees, even if each may " +
                            "be a weak classifier, their combined classification " +
                            "accuracy is strong. Therefore as we want to reduce a " +
                            "bias in a random forest, we may want to consider to " +
                            "choose a parameter m to be slightly less than M.\n")
                    printfv(2, verbose, "Thus we choose the maximum number of the " +
                            "variables considered at the node to be " +
                            "m=min(M,math.ceil(2*math.sqrt(M)))" +
                            "=min(M,math.ceil(2*math.sqrt(%d)))=%d.\n", M, m)
                    return m
                #Classification
                def display_classification(verbose, random_forest, heading,
                                           enquired_column, incomplete_data):
                    """Print the forest's vote-based classification for each
                    feature in incomplete_data, or a notice when empty."""
                    # Guard clause: nothing to classify.
                    if not incomplete_data:
                        printfv(0, verbose, "No data to classify.\n")
                        return
                    for feature in incomplete_data:
                        printfv(0, verbose, "\nFeature: " +
                                str(feature) + "\n")
                        # Delegate the per-feature vote display to the helper.
                        display_classification_for_feature(
                            verbose, random_forest, heading,
                            enquired_column, feature)

                def display_classification_for_feature(verbose, random_forest, heading,
                                                       enquired_column, feature):
                    classification = {}
                    for i in range(0, len(random_forest)):
                        group = decision_tree.classify_by_tree(
                            random_forest[i], heading, enquired_column, feature)
                        common.dic_inc(classification, group)
                        printfv(0, verbose, "Tree " + str(i) +
                                " votes for the class: " + str(group) + "\n")
                    printfv(0, verbose, "The class with the maximum number of votes " +
                            "is '" + str(common.dic_key_max_count(classification)) +


                                                     [ 85 ]
   92   93   94   95   96   97   98   99   100   101   102