Fixed up py_mutations_hub.py

naturalness · Aug 1, 2017 · 3b30480 · 3b30480
1 parent 2739a3f
commit 3b30480
Show file tree

Hide file tree

Showing 6 changed files with 254 additions and 649 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1 @@
+train_pre_data.txt
diff --git a/__pycache__/toCheck.pypy-41.pyc b/__pycache__/toCheck.pypy-41.pyc
diff --git a/keras_model.py b/keras_model.py
@@ -17,7 +17,7 @@
 from itertools import izip
 
 import os
-
+import cPickle
 import matplotlib.pyplot as plt
 
 # BATCH = 60
@@ -34,10 +34,11 @@
 BATCH_SIZE = 66
 
 
-def getInputTen():
-	one_hot_good, one_hot_bad_ins, one_hot_bad_del, one_hot_bad_sub, _, _, _, _, _ = perform(0)
+def getInputTen(allTrainData):
+	#one_hot_good, one_hot_bad_ins, one_hot_bad_del, one_hot_bad_sub, _, _, _, _, _ = perform(0)
+	one_hot_good, one_hot_bad_ins, one_hot_bad_del, one_hot_bad_sub = allTrainData[0][0],allTrainData[0][1], allTrainData[0][2], allTrainData[0][3]
 	while(one_hot_good == 1):
-			one_hot_good, one_hot_bad_ins, one_hot_bad_del, one_hot_bad_sub, _, _, _, _, _ = perform(0)
+			one_hot_good, one_hot_bad_ins, one_hot_bad_del, one_hot_bad_sub = allTrainData[0][0],allTrainData[0][1], allTrainData[0][2], allTrainData[0][3]
 	#print type(one_hot_good)
 	#print one_hot_good
 	windowInd = 0
@@ -150,10 +151,11 @@ def getInputTen():
 				#print "FILE IND"
 				print fileInd
 				windowInd = 0
-				one_hot_good, one_hot_bad_ins, one_hot_bad_del, one_hot_bad_sub, _, _, _, _, _ = perform(fileInd)
+				#one_hot_good, one_hot_bad_ins, one_hot_bad_del, one_hot_bad_sub, _, _, _, _, _ = perform(fileInd)
+				one_hot_good, one_hot_bad_ins, one_hot_bad_del, one_hot_bad_sub = allTrainData[fileInd][0],allTrainData[fileInd][1], allTrainData[fileInd][2], allTrainData[fileInd][3]
 				while(one_hot_good == 1):
 					fileInd+=1
-					one_hot_good, one_hot_bad_ins, one_hot_bad_del, one_hot_bad_sub, _, _, _, _, _ = perform(fileInd)
+					one_hot_good, one_hot_bad_ins, one_hot_bad_del, one_hot_bad_sub = allTrainData[fileInd][0],allTrainData[fileInd][1], allTrainData[fileInd][2], allTrainData[fileInd][3]
 
 
 				for p in range(numGoodLeft):
@@ -208,10 +210,13 @@ def getInputTen():
 			one_hot_bad_sub.insert(p, old_one_hot_bad_sub[len(old_one_hot_bad_sub)-numBadSubLeft+p])
 		'''
 
-def getOutputTen():
-	_, _, _, _, one_hot_good_out, one_hot_bad_ins_out, one_hot_bad_del_out, one_hot_bad_sub_out, _ = perform(0)
+def getOutputTen(allTrainData):
+	#_, _, _, _, one_hot_good_out, one_hot_bad_ins_out, one_hot_bad_del_out, one_hot_bad_sub_out, _ = perform(0)
+	one_hot_good_out, one_hot_bad_ins_out, one_hot_bad_del_out, one_hot_bad_sub_out = allTrainData[0][4],allTrainData[0][5], allTrainData[0][6], allTrainData[0][7]
+
 	while(one_hot_good_out == 1):
-			_, _, _, _, one_hot_good_out, one_hot_bad_ins_out, one_hot_bad_del_out, one_hot_bad_sub_out, _ = perform(0)
+			#_, _, _, _, one_hot_good_out, one_hot_bad_ins_out, one_hot_bad_del_out, one_hot_bad_sub_out, _ = perform(0)
+			one_hot_good_out, one_hot_bad_ins_out, one_hot_bad_del_out, one_hot_bad_sub_out = allTrainData[0][4],allTrainData[0][5], allTrainData[0][6], allTrainData[0][7]
 	#print type(one_hot_good_out)
 	#print one_hot_good_out
 	windowInd = 0
@@ -382,10 +387,12 @@ def getOutputTen():
 
 				fileInd += 1
 				windowInd = 0
-				_, _, _, _, one_hot_good_out, one_hot_bad_ins_out, one_hot_bad_del_out, one_hot_bad_sub_out, _ = perform(fileInd)
+				#_, _, _, _, one_hot_good_out, one_hot_bad_ins_out, one_hot_bad_del_out, one_hot_bad_sub_out, _ = perform(fileInd)
+				one_hot_good_out, one_hot_bad_ins_out, one_hot_bad_del_out, one_hot_bad_sub_out = allTrainData[fileInd][4],allTrainData[fileInd][5], allTrainData[fileInd][6], allTrainData[fileInd][7]
 				while(one_hot_good_out == 1):
 					fileInd+=1
-					_, _, _, _, one_hot_good_out, one_hot_bad_ins_out, one_hot_bad_del_out, one_hot_bad_sub_out, _ = perform(fileInd)
+					#_, _, _, _, one_hot_good_out, one_hot_bad_ins_out, one_hot_bad_del_out, one_hot_bad_sub_out, _ = perform(fileInd)
+					one_hot_good_out, one_hot_bad_ins_out, one_hot_bad_del_out, one_hot_bad_sub_out = allTrainData[fileInd][4],allTrainData[fileInd][5], allTrainData[fileInd][6], allTrainData[fileInd][7]
 
 				#for p in range(numGoodOutLeft):
 				#	one_hot_good_out.insert(p, old_one_hot_good_out[len(old_one_hot_good_out)-numGoodOutLeft+p])
@@ -945,8 +952,14 @@ def initData():
 	print "Start..."
 
 	#main_input = Input(shape=(10,87), dtype='int32', name='main_input')
+	allTrainData = cPickle.load( open( "train_pre_data.txt", "rb" ) )
+	print "GOT DATA"
 
-
+	sum = 0
+	for x in allTrainData:
+		sum += len(x[2])
+	print sum
+	print "SUM"
 	model = Sequential()
 	model.add(Dense(4, activation='relu', input_shape=(10, 88), batch_size=66))
 	model.add(Dropout(0.5))
@@ -986,10 +999,10 @@ def initData():
 	# SECOnD THOUSAND TOKENS: 1353925
 
 	history = model.fit_generator(
-               	izip(getInputTen(), getOutputTen()),
+               	izip(getInputTen(allTrainData), getOutputTen(allTrainData)),
                 steps_per_epoch=16521,
-		validation_data=izip(getInputValTen(), getOutputValTen()),
-		validation_steps=20513,
+		#validation_data=izip(getInputValTen(), getOutputValTen()),
+		#validation_steps=20513,
                 epochs=5,  
                 verbose=2	
             )

diff --git a/py_mutations_hub.py b/py_mutations_hub.py
@@ -18,6 +18,7 @@
 from mutate_token_sub import subTokMutS
 import sys
 import cPickle
+from scipy import sparse
 
 # NUM TOTAL: 462 563
 # ACTUAl: 462 540
@@ -49,7 +50,9 @@
 START_TOKEN = '<s>'
 END_TOKEN = '</s>'
 
-def one_hot(indexed_tokens):
+def one_hot_a(indexed_tokens):
+	print indexed_tokens
+	print len(indexed_tokens)
 	one_hot = []
 	nb_classes = 88
 	one_hot_targets = np.eye(nb_classes)[indexed_tokens]
@@ -62,6 +65,9 @@ def one_hot(indexed_tokens):
 	#one_hot.astype(int)
 	#print type(one_hot[0][0])
 	return one_hot
+
+def one_hot(indexed_tokens):  # identity pass-through: returns indexed_tokens unchanged, no one-hot expansion is done here
+	return indexed_tokens
 
 
 def set_from_json(all_tokens, flag):
@@ -318,13 +324,14 @@ def perform(curr):
 			#print len(all_tokens)
 			#print len(allGood)
 			one_hot_good = vocabularize_tokens(all_tokens, False)
-			one_hot_good_out = []
-			for x in range(len(all_tokens)+(WINDOW_SIZE-1)+(WINDOW_SIZE-1)):
-				toAdd = []
-				toAdd = [0] * NUM_BITS_OUTPUT
-				toAdd[0] = 0
-				toAdd[1] = 1 # FIRST BIT (01) - INDICATE NO ERROR (1 because rest are 0 and so add up to 1)
-				one_hot_good_out.append(toAdd)
+			#one_hot_good_out = []
+			#for x in range(len(all_tokens)+(WINDOW_SIZE-1)+(WINDOW_SIZE-1)):
+				#toAdd = []
+				#toAdd = [0] * NUM_BITS_OUTPUT
+				#toAdd[0] = 0
+				#toAdd[1] = 1 # FIRST BIT (01) - INDICATE NO ERROR (1 because rest are 0 and so add up to 1)
+				#toAdd = [1]
+			#one_hot_good_out.append(1)
 
 
 			#print "DHVANI"
@@ -412,42 +419,45 @@ def perform(curr):
 
 
 			#print "NEXT STEP...C"
-			passInsErrorInd = (bruhInd+1)+(WINDOW_SIZE-1) 
+			#passInsErrorInd = (bruhInd+1)+(WINDOW_SIZE-1) 
 
-			one_hot_bad_ins_out = []
-			trueErrorInd = (bruhInd+1)+(WINDOW_SIZE-1) 
+			#one_hot_bad_ins_out = []
+			#trueErrorInd = (bruhInd+1)+(WINDOW_SIZE-1) 
 
 			# INSERT OUT_PUT
 
-			iterNum = len(new_tokens_ins)+(WINDOW_SIZE-1)+(WINDOW_SIZE-1)
+			#iterNum = len(new_tokens_ins)+(WINDOW_SIZE-1)+(WINDOW_SIZE-1)
 			#print "divide"
 			#print trueErrorInd
 			#print iterNum
+			'''
 			for x in range(iterNum):
 				#if x <= trueErrorInd <= (x+trueErrorInd):
 				#if x <= trueErrorInd <= x+(WINDOW_SIZE-1):
 				if True:
 					# DIFF - ACTUAL ERROR
 					#print x
-					toAdd = []
-					toAdd = [0] * NUM_BITS_OUTPUT
-					toAdd[0] = 1 # FIRST BIT (10) - INDICATE ERROR 
-					toAdd[1] = 0
-					if NO_TOKEN != None:
-						toAdd[2] = 0
-						toAdd[3] = 1
-					if INSERTION != None:
-						toAdd[4] = 0
-						toAdd[5] = 0
-						toAdd[6] = 1
-					toAdd[7] = 1
+					#toAdd = []
+					#toAdd = [0] * NUM_BITS_OUTPUT
+					#toAdd[0] = 1 # FIRST BIT (10) - INDICATE ERROR 
+					#toAdd[1] = 0
+					#if NO_TOKEN != None:
+					#	toAdd[2] = 0
+					#	toAdd[3] = 1
+					#if INSERTION != None:
+					#	toAdd[4] = 0
+					#	toAdd[5] = 0
+					#	toAdd[6] = 1
+					#toAdd[7] = 1
+					toAdd = [0,3,6,7]
 					one_hot_bad_ins_out.append(toAdd)
 				else:
 					toAdd = []
 					toAdd = [0] * NUM_BITS_OUTPUT
 					toAdd[0] = 1
 					toAdd[1] = 0 # FIRST BIT (01) - INDICATE NO ERROR (1 because rest are 0 and so add up to 1)
 					one_hot_bad_ins_out.append(toAdd)
+			'''
 			#print "Morning"	
 			#print len(new_tokens_ins)
 			#print len(one_hot_bad_ins_out)
@@ -508,11 +518,11 @@ def perform(curr):
 
 			one_hot_bad_del = vocabularize_tokens(new_tokens_del, True)
 
-			one_hot_bad_del_out = []
-			trueErrorInd = (bruhInd)+(WINDOW_SIZE-1)
+			#one_hot_bad_del_out = []
+			#trueErrorInd = (bruhInd)+(WINDOW_SIZE-1)
 
 			# DELETE OUT_PUT
-			iterNum = len(new_tokens_del)+(WINDOW_SIZE-1)+(WINDOW_SIZE-1)
+			#iterNum = len(new_tokens_del)+(WINDOW_SIZE-1)+(WINDOW_SIZE-1)
 			#print "divide"
 			#print len(send)
 			#print trueErrorInd
@@ -523,31 +533,34 @@ def perform(curr):
 			#oneH_ind_deleted = set_from_json_nonarr(send, True)
 			#print oneH_ind_deleted
 			#print "rad"
+			'''
 			for x in range(iterNum):
 				#if x <= trueErrorInd <= (x+trueErrorInd):
 				if True:
 					# DIFF - ACTUAL ERROR
 					#print x
-					toAdd = []
-					toAdd = [0] * NUM_BITS_OUTPUT
-					toAdd[0] = 1 # FIRST BIT (10) - INDICATE ERROR 
-					toAdd[1] = 0
-					if YES_TOKEN != None:
-						toAdd[2] = 1
-						toAdd[3] = 0
-					if DELETION != None:
-						toAdd[4] = 0
-						toAdd[5] = 1
-						toAdd[6] = 0
-					toAdd[7] = 1
-					toAdd[17] = 1
+					#toAdd = []
+					#toAdd = [0] * NUM_BITS_OUTPUT
+					#toAdd[0] = 1 # FIRST BIT (10) - INDICATE ERROR 
+					#toAdd[1] = 0
+					#if YES_TOKEN != None:
+					#	toAdd[2] = 1
+					#	toAdd[3] = 0
+					#if DELETION != None:
+					#	toAdd[4] = 0
+					#	toAdd[5] = 1
+					#	toAdd[6] = 0
+					#toAdd[7] = 1
+					#toAdd[17] = 1
+					toAdd = [0,2,5,7,17]
 					one_hot_bad_del_out.append(toAdd)
 				else:
 					toAdd = []
 					toAdd = [0] * NUM_BITS_OUTPUT
 					toAdd[0] = 0
 					toAdd[1] = 1 # FIRST BIT (01) - INDICATE NO ERROR (1 because rest are 0 and so add up to 1)
 					one_hot_bad_del_out.append(toAdd)
+			'''
 			#print "Morning"	
 			#print len(allGood)
 			#print len(one_hot_bad_del_out)
@@ -624,10 +637,10 @@ def perform(curr):
 
 			one_hot_bad_sub = vocabularize_tokens(new_tokens_sub, True)
 
-			one_hot_bad_sub_out = []
-			trueErrorInd = (bruhInd)+(WINDOW_SIZE-1) 
+			#one_hot_bad_sub_out = []
+			#trueErrorInd = (bruhInd)+(WINDOW_SIZE-1) 
 			# SUB OUT_PUT
-			iterNum = len(new_tokens_sub)+(WINDOW_SIZE-1)+(WINDOW_SIZE-1)
+			#iterNum = len(new_tokens_sub)+(WINDOW_SIZE-1)+(WINDOW_SIZE-1)
 			#print "divide"
 			#print len(send)
 			#print trueErrorInd
@@ -638,33 +651,38 @@ def perform(curr):
 			#oneH_sub_switch = set_from_json_nonarr(sendS, True)
 			#print oneH_sub_switch
 			#print "rad"
+
+			'''
 			for x in range(iterNum):
 				#if x <= trueErrorInd <= (x+trueErrorInd):
 				#if x <= trueErrorInd <= x+(WINDOW_SIZE-1):
 				if True:
 					# DIFF - ACTUAL ERROR
 					#print x
-					toAdd = []
-					toAdd = [0] * NUM_BITS_OUTPUT
-					toAdd[0] = 1 # FIRST BIT (10) - INDICATE ERROR 
-					toAdd[1] = 0
+					#toAdd = []
+					#toAdd = [0] * NUM_BITS_OUTPUT
+					#toAdd[0] = 1 # FIRST BIT (10) - INDICATE ERROR 
+					#toAdd[1] = 0
 					
-					toAdd[2] = 1
-					toAdd[3] = 0
+					#toAdd[2] = 1
+					#toAdd[3] = 0
 					
-					toAdd[4] = 1
-					toAdd[5] = 0
-					toAdd[6] = 0
+					#toAdd[4] = 1
+					#toAdd[5] = 0
+					#toAdd[6] = 0
 
-					toAdd[7] = 1
-					toAdd[17] = 1
+					#toAdd[7] = 1
+					#toAdd[17] = 1
+					toAdd = [0,2,4,7,17]
 					one_hot_bad_sub_out.append(toAdd)
 				else:
 					toAdd = []
 					toAdd = [0] * NUM_BITS_OUTPUT
 					toAdd[0] = 0
 					toAdd[1] = 1 # FIRST BIT (01) - INDICATE NO ERROR (1 because rest are 0 and so add up to 1)
 					one_hot_bad_sub_out.append(toAdd)
+
+			'''
 			#print "Morning"	
 			#print len(allGood)
 			#print len(all_tokens)
@@ -704,11 +722,20 @@ def perform(curr):
 			#sizes = [len(one_hot_good), len(one_hot_bad_ins),len(one_hot_bad_del),len(one_hot_bad_sub)]
 			#minSize = int(min(float(siz) for siz in sizes)) # min of a generator
 			#return minSize
-
 
-
-			toPass = [one_hot_good, one_hot_bad_ins, one_hot_bad_del, one_hot_bad_sub, one_hot_good_out, one_hot_bad_ins_out, one_hot_bad_del_out, one_hot_bad_sub_out, passInsErrorInd]
-
+			#toPassGood = []
+			#print len(one_hot_good)
+			#for good in one_hot_good:
+			#	ind = good.index(1.0)
+			#	toPassGood.append(ind)
+			#print len(toPassGood)
+			#print one_hot_bad_sub_out
+			#print type(radha)	
+
+			toPass = [one_hot_good, one_hot_bad_ins, one_hot_bad_del, one_hot_bad_sub]
+			#toPass = sparse.csr_matrix(toPassMatrix)
+			#print toPass
+			#print type(radha)
 
 			#cPickle.dump(one_hot_bad_ins, fileStore)
 			#cPickle.dump(one_hot_bad_del, fileStore)
@@ -738,16 +765,26 @@ def giveItems():
 	#print allData[3][8]
 
 if __name__ == '__main__':
-    all_data_to_pass = []
-    for x in range(2001):
+    all_train_to_pass = []
+    for x in range(1000):
 	print x
 	if x != -1:
 		toP = perform(x)
-		all_data_to_pass.append(toP)
-    fileStore = open("all_pre_data.txt", 'w')
-    cPickle.dump(all_data_to_pass, fileStore)
+		all_train_to_pass.append(toP)
+    fileStore = open("train_pre_data.txt", 'w')
+    cPickle.dump(all_train_to_pass, fileStore)
     fileStore.close()
-    giveItems()
+    all_val_to_pass = []
+    for x in range(1000):
+	print x
+	if x != -1:
+		toPV = perform(x+1001)
+		all_val_to_pass.append(toPV)
+    fileStoreV = open("val_pre_data.txt", 'w')
+    cPickle.dump(all_val_to_pass, fileStoreV)
+    fileStoreV.close()
+    print "FINISHED"
+    #giveItems()
 
     sys.exit()
     for x in range(10):

diff --git a/py_mutations_hub.pyc b/py_mutations_hub.pyc