MPI method for Apriori

jiteshjha · jiteshjha · commit 080852f1b2e4 · 2017-03-27T20:25:23.000+05:30
diff --git a/apriori_mpi.py b/apriori_mpi.py
@@ -2,10 +2,18 @@
 import sys
 import operator
 import time
+from math import floor
+from mpi4py import MPI
+
+from os import getcwd, walk, system, path
+
+
+comm = MPI.COMM_WORLD
+rank = comm.Get_rank()
 
 start_time = time.clock()
 
-def find_frequent_1_itemsets(D, min_sup, row_count):
+def find_frequent_1_itemsets(D, min_sup):
 
     dataset = None
     itemset = {}
@@ -26,7 +34,7 @@ def find_frequent_1_itemsets(D, min_sup, row_count):
         Calculate frequent itemsets
     """
     for item in itemset.copy():
-        if itemset[item]/float(row_count) < min_sup:
+        if itemset[item] < min_sup:
             itemset.pop(item, None)
 
     return sorted(itemset.items(), key=operator.itemgetter(0))
@@ -61,15 +69,15 @@ def has_frequent_subset(c, L, k):
 
     return True
 
-def apriori_gen(L, k, row_count):
+def apriori_gen(L, k):
 
     C = []
     for l1 in L:
         for l2 in L:
 
             first_itemlist = l1[0].split(",")
             second_itemlist = l2[0].split(",")
-            #print first_itemlist, second_itemlist
+
             i = 0
             flag = True
             while i <= k-2-1:
@@ -111,15 +119,14 @@ def generate_association_rules(itemset, min_conf, row_count):
                         if item_sup is not None and pair[1]/float(item_sup) >= min_conf:
                                 print ",".join(item), "=>", ",".join(list(set(pair[0].split(',')) - set(item))), "Support: ", float("{0:.2f}".format(float(item_sup)/row_count))*100, "%", "Confidence: ", float("{0:.2f}".format(pair[1]/float(item_sup)*100)), "%"
 
-def main():
+def main(D):
 
     """
     Input: D, a dataset of transaction
            min_sup, the minimum support count threshold
            min_conf, the minimum confidence threshold
     """
 
-    D = str(sys.argv[1])
     min_sup = float(sys.argv[2])
     min_conf = float(sys.argv[3])
 
@@ -129,7 +136,10 @@ def main():
         dataset = csv.reader(f)
         row_count = sum(1 for row in dataset)
 
-    L1 = find_frequent_1_itemsets(D, min_sup, row_count)
+    min_sup = min_sup * row_count
+    min_conf = min_conf * row_count
+
+    L1 = find_frequent_1_itemsets(D, min_sup)
     itemset = [L1]
 
 
@@ -140,10 +150,10 @@ def main():
         if not itemset[k-2]:
             break
 
-        C = apriori_gen(itemset[k-2], k, row_count)
+        C = apriori_gen(itemset[k-2], k)
         L = {}
 
-
+        
         with open(D, 'rb') as f:
             dataset = csv.reader(f)
             for t in dataset:
@@ -153,25 +163,122 @@ def main():
                             L[c] += 1
                         else:
                             L[c] = 1
-
+        
         for item in L.copy():
-            if L[item]/float(row_count) < min_sup:
+            if L[item] < min_sup:
                 L.pop(item, None)
 
         itemset.append(sorted(L.items(), key=operator.itemgetter(0)))
         k += 1
 
     itemset.pop()
-    generate_association_rules(itemset, min_conf, row_count)
-    print "\nResultant Item sets:"
 
-    for k in range(1, len(itemset)):
-        print "\n", k, "-itemsets:\n"
-        for item in itemset[k]:
-            print item[0], "| Support ", float("{0:.2f}".format(item[1]/float(row_count)))*100, "%"
+    return itemset
 
 
 if __name__ == "__main__":
-    main()
 
-print "\nProgram Execution Time: ",time.clock() - start_time, " seconds"
+    onlyfiles = []
+
+    if rank == 0:
+
+        """
+            Create a directory called "temp" and split the given dataset
+            into partitions there, sized by the number of MPI processes
+        """
+
+        system("mkdir temp")
+        dataset = str(sys.argv[1])
+        num_process = comm.Get_size()
+        file_size = int(floor(path.getsize(dataset)/(float(1000000) * num_process)))
+        system("split --bytes=" + str(file_size)+"M " + dataset + " temp/retail")
+
+
+        # Get current working directory
+        cwd = getcwd()
+
+        """
+         Collect the names of the partition files found in "temp"
+        """
+
+        for (dirpath, dirnames, filenames) in walk(cwd+"/temp"):
+            onlyfiles.extend(filenames)
+            break
+        
+    # Each rank receives one partition file name from root (scatter needs exactly one name per rank)
+    dataset = comm.scatter(onlyfiles, root=0)
+    
+    # Generate local frequent itemsets
+    itemset = main("temp/"+dataset)
+
+    # Root process collects all the local frequent itemsets
+    set_itemsets = comm.gather(itemset, root=0)
+
+    if rank == 0:
+
+        """
+            Merge the local frequent itemsets gathered from all processes, grouping them by itemset size
+        """
+
+        itemsetsi = []
+
+        max_itemsets_length = max([len(itemsets) for itemsets in set_itemsets])
+
+        for i in xrange(0, max_itemsets_length):
+            iset = set()
+
+            for j in xrange(0, num_process):
+                temp_set = []
+                if(i <= (len(set_itemsets[j])-1)):
+                    for item in set_itemsets[j][i]:
+                        temp_set.append(list(item)[0])
+                    
+                    iset = iset.union(list(temp_set))
+
+            itemsetsi.append(dict((k,0) for k in list(iset)))
+        
+        # Remove the non-empty temp directory
+        system("rm -rf temp")
+
+        # Get the original dataset name
+        D = str(sys.argv[1])
+
+        # Count the global support of each candidate itemset over the full dataset
+        row_count = 0
+        with open(D, 'rb') as f:
+            dataset = csv.reader(f)
+            for t in dataset:
+                for itemset in itemsetsi:
+                    for item in itemset:
+                        if set(item.split(",")).issubset(set(t)):
+                            itemset[item] += 1
+                row_count += 1
+
+        # Remove non-frequent global itemsets
+        min_sup = float(sys.argv[2])
+        for itemset in itemsetsi:
+            for item in itemset.copy():
+                if (itemset[item]/float(row_count)) < min_sup:
+                    itemset.pop(item, None)
+
+        # Display Itemsets    
+        print "\nResultant Item sets:"
+
+        k = 1
+        for itemset in itemsetsi:
+            if bool(itemset):
+                print "\n", k, "-itemsets:\n"
+                k += 1
+                for item in itemset:
+                    print item, "| Support ", float("{0:.2f}".format(itemset[item]/float(row_count)))*100, "%"
+
+        # Convert each non-empty dictionary into a key-sorted list of (itemset, count) pairs
+        list_itemsets = [(sorted(itemset.items(), key=operator.itemgetter(0))) for itemset in itemsetsi if bool(itemset)]
+        
+        # Get minimum confidence
+        min_conf = float(sys.argv[3])
+
+        # Generate association rules
+        generate_association_rules(list_itemsets, min_conf, row_count)
+
+print "\nRank : ",rank, " - Program Execution Time: ",time.clock() - start_time, " seconds"