Page 123 - Data Science Algorithms in a Week
P. 123

Clustering into K Clusters


                # Groups the points into k clusters, recording every pass of the loop.
                # Returns the history: a list of (point_groups, centroids) pairs, one
                # per pass. The last pair is the state for which recomputing the
                # centroids left every one of them unchanged.
                def cluster_with_history(points, k):
                    centroids = choose_init_centroids(points, k)
                    point_groups = points_to_point_groups(points)
                    history = []
                    while True:
                        # Re-assign the groups against the current centroids and
                        # record the resulting state before testing for the end.
                        point_groups = assign_groups(point_groups, centroids)
                        history.append((point_groups, centroids))
                        new_centroids = choose_centroids(point_groups, k)
                        # Compare position by position, stopping at the first
                        # centroid that differs from its recomputed counterpart.
                        changed = any(
                            centroids[idx] != new_centroids[idx]
                            for idx in range(0, len(centroids))
                        )
                        if not changed:
                            return history
                        centroids = new_centroids

                # Program start
                csv_file = sys.argv[1]
                k = int(sys.argv[2])
                everything = False
                # The third argument sys.argv[3] represents the number of the step of the
                # algorithm starting from 0 to be shown or "last" for displaying the last
                # step and the number of the steps.
                if sys.argv[3] == "last":
                    everything = True
                else:
                    step = int(sys.argv[3])

                data = common.csv_file_to_list(csv_file)
                points = data_to_points(data)  # Represent every data item by a point.
                history = cluster_with_history(points, k)
                if everything:
                    print "The total number of steps:", len(history)
                    print "The history of the algorithm:"
                    (point_groups, centroids) = history[len(history) - 1]
                    # Print all the history.
                    print_cluster_history(history)
                    # But display the situation graphically at the last step only.
                    draw(point_groups, centroids)
                else:
                    (point_groups, centroids) = history[step]
                    print "Data for the step number", step, ":"
                    print point_groups, centroids
                    draw(point_groups, centroids)



                                                    [ 111 ]
   118   119   120   121   122   123   124   125   126   127   128