Page 47 - Data Science Algorithms in a Week

P. 47

Naive Bayes

# 1. items in a row should be separated by a comma ','
# 2. the first row should be a heading - should contain a name for each
# column of the data.
# 3. the remaining rows should contain the data itself - rows with
# complete data and rows with incomplete data.
# A row with complete data is a row that has a non-empty value other
# than a question mark in every column. A row with incomplete data is
# a row whose last column has the value of a question mark '?'.
# Please, run this file on the example chess.csv to understand this help
# better:
# $ python naive_bayes.py chess.csv

import imp
import sys
sys.path.append('../common')
import common # noqa

# Calculates the Bayesian probability for the rows of incomplete data and
# returns them completed by the Bayesian probabilities. complete_data
# are the rows with the data that is complete and are used to calculate
# the conditional probabilities to complete the incomplete data.
def bayes_probability(heading, complete_data, incomplete_data,
enquired_column):
conditional_counts = {}
enquired_column_classes = {}
for data_item in complete_data:
common.dic_inc(enquired_column_classes,
data_item[enquired_column])
for i in range(0, len(heading)):
if i != enquired_column:
common.dic_inc(
conditional_counts, (
heading[i], data_item[i],
data_item[enquired_column]))
completed_items = []
for incomplete_item in incomplete_data:
partial_probs = {}
complete_probs = {}
probs_sum = 0
for enquired_group in enquired_column_classes.items():
# For each class of the enquired variable A calculate
# the probability P(A)*P(B1|A)*P(B2|A)*...*P(Bn|A) where
# B1,...,Bn are the remaining variables.
probability = float(common.dic_key_count(
enquired_column_classes,
enquired_group[0])) / len(complete_data)
for i in range(0, len(heading)):

[ 35 ]

42 43 44 45 46 47 48 49 50 51 52