Commit
Add MLP stuff
Dhvani Patel committed Jul 13, 2017
1 parent 4162624 commit adee584
Showing 9 changed files with 177 additions and 45 deletions.
28 changes: 28 additions & 0 deletions Untitled Document
@@ -0,0 +1,28 @@
# Copyright 2017 Dhvani Patel

from keras.models import Sequential
from keras.layers import Dense, Dropout
import numpy
from Token import Token
from py_mutations_hub import perform

# BATCH = 60
# So 15 groups of 4 windows each
# One group of 4:
# [Good, Insert, Delete, Sub]
# WINDOW SIZE = 10

def create_batches():
one_hot_good, one_hot_bad_ins, one_hot_bad_del, one_hot_bad_sub = perform()
print "Finished..."

#return train_input, train_output

def initData():
print "Start..."
create_batches()
#train_input, train_output = create_batches()


if __name__ == '__main__':
initData()
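# Worked sketch of the batching comment above (illustration only, not part of
# the committed file): each training batch mixes the four classes
# [Good, Insert, Delete, Sub] in equal shares.
#   BATCH = 60                               # windows per batch
#   CLASSES = 4                              # Good, Insert, Delete, Sub
#   WINDOW_SIZE = 10                         # tokens per window
#   windows_per_class = BATCH / CLASSES      # 15 windows of each class
#   tokens_per_batch = BATCH * WINDOW_SIZE   # 600 tokens per batch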
Binary file modified __pycache__/toCheck.pypy-41.pyc
93 changes: 69 additions & 24 deletions keras_model.py
@@ -5,41 +5,86 @@
import numpy
from Token import Token
from py_mutations_hub import perform
from skimage.io import imread
from skimage.transform import resize
import numpy as np
from itertools import izip_longest

# BATCH = 60
# So 15 groups of 4 windows each
# One group of 4:
# [Good, Insert, Delete, Sub]
# WINDOW SIZE = 10

def chunker(seq, size):
return (seq[pos:pos + size] for pos in xrange(0, len(seq), size))
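# Hedged usage sketch for chunker (illustration only): it yields consecutive,
# non-overlapping windows of `size` items, with any leftover items forming a
# shorter final window.
#   list(chunker(range(25), 10))   # -> three windows of 10, 10, and 5 items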

class feedData():

def __init__(self, x_set, y_set, batch_size):
self.X,self.y = x_set,y_set
self.batch_size = batch_size

def __len__(self):
return len(self.X) // self.batch_size

def __getitem__(self,idx):
batch_x = self.X[idx*self.batch_size:(idx+1)*self.batch_size]
batch_y = self.y[idx*self.batch_size:(idx+1)*self.batch_size]
return np.array([batch_x]), np.array(batch_y)
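# feedData follows the keras.utils.Sequence batching pattern: __len__ reports
# how many whole batches fit in the data and __getitem__ slices out one batch.
# A hedged, self-contained usage sketch (illustration only, dummy data; note
# that __getitem__ wraps batch_x in a list, so the x batch comes back with a
# leading axis of length 1):
def _feed_data_sketch():
    demo_x = np.zeros((120, 10, 87))   # 120 windows of 10 tokens x 87-dim one-hot
    demo_y = np.zeros((120, 4))        # one 4-way label per window
    feed = feedData(demo_x, demo_y, 60)
    print len(feed)                    # 2 whole batches
    x_batch, y_batch = feed[0]
    print x_batch.shape                # (1, 60, 10, 87) because of the extra list
    print y_batch.shape                # (60, 4)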


def create_batches():
# Copyright 2016, 2017 Eddie Antonio Santos <[email protected]>
"""
Return a tuple of infinite training and validation examples,
respectively.
"""
training = LoopBatchesEndlessly(
filehashes=self.training_set,
vectors_path=self.vectors_path,
batch_size=self.batch_size,
context_length=self.context_length,
backwards=self.backwards
)
validation = LoopBatchesEndlessly(
filehashes=self.validation_set,
vectors_path=self.vectors_path,
batch_size=self.batch_size,
context_length=self.context_length,
backwards=self.backwards
)
return training, validation
one_hot_good, one_hot_bad_ins, one_hot_bad_del, one_hot_bad_sub = perform()
print "Finished..."

ohg_g = chunker(one_hot_good, 10)
ohbi_g = chunker(one_hot_bad_ins, 10)
ohbd_g = chunker(one_hot_bad_del, 10)
ohbs_g = chunker(one_hot_bad_sub, 10)

ohg_group = []
for rad in ohg_g:
ohg_group.append(rad)

ohbi_group = []
for rad in ohbi_g:
ohbi_group.append(rad)

ohbd_group = []
for rad in ohbd_g:
ohbd_group.append(rad)

ohbs_group = []
for rad in ohbs_g:
ohbs_group.append(rad)

print len(ohg_group)
print len(ohbi_group)
print len(ohbd_group[53])
#print ohbd_group[53]
print len(ohbs_group)

goodA = np.array(ohg_group)
insA = np.array(ohbi_group)
delA = np.array(ohbd_group)
subA = np.array(ohbs_group)

temp = np.insert(subA, np.arange(len(delA)), delA)
temp2 = np.insert(temp, np.arange(len(insA)), insA)
train_input = np.insert(temp2, np.arange(len(goodA)), goodA)
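# np.insert with np.arange interleaves two equal-length arrays element by
# element: each value of the second array is inserted just before the matching
# position of the first. A minimal standalone sketch of that behaviour
# (illustration only):
#   a = np.array([10, 20, 30])
#   b = np.array([1, 2, 3])
#   np.insert(a, np.arange(len(b)), b)   # -> array([ 1, 10,  2, 20,  3, 30])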

# feedData(train_input

#return train_input, train_output

def initData():
print "Start..."
one_hot_all = perform()
print len(one_hot_all)

train_input, train_output = create_batches()
create_batches()
model = Sequential()
# Fit the model
#model.fit(iter(train_input), iter(train_output), epochs=150, batch_size=10)
#train_input, train_output = create_batches()
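# A hedged, illustrative sketch (an assumption, not the committed design) of
# how the MLP itself might be assembled once create_batches() returns real
# training data: windows of 10 tokens x 87-dim one-hot vectors flattened to
# 870 inputs, a 4-way softmax over [Good, Insert, Delete, Sub], and the
# keras.layers Dense/Dropout imports assumed available as in Untitled Document.
def build_mlp_sketch():
    model = Sequential()
    model.add(Dense(256, activation='relu', input_dim=10 * 87))
    model.add(Dropout(0.5))
    model.add(Dense(64, activation='relu'))
    model.add(Dense(4, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam',
                  metrics=['accuracy'])
    return model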


if __name__ == '__main__':
11 changes: 4 additions & 7 deletions mutate_token_delete.py
@@ -127,6 +127,7 @@ def deleteTokMut(raw_tokens, raw_text):
if toTest == None:
print "Try again..."
deleteTokMut(raw_tokens_pass, raw_text)
return new_text, YES_TOKEN, DELETION, out_tokens_loc, send
else:
print toTest[0]
print toTest[0].filename
@@ -135,12 +136,8 @@ def deleteTokMut(raw_tokens, raw_text):
print toTest[0].functionname
print toTest[0].text
print toTest[0].errorname
print "-----------FINISHED-------------------"
print chosenLineInd+1
print out_tokens_loc
return new_text, YES_TOKEN, DELETION, out_tokens_loc, send

print "-----------FINISHED-------------------"
print chosenLineInd+1
print out_tokens_loc
print len(raw_tokens_pass)
print len(out_tokens_loc)
print lenD
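# Hedged note on the retry branch added above (illustration only): when toTest
# is None the mutation is re-attempted recursively, and the new return
# statement hands back this call's locals rather than the retried result. A
# sketch of the alternative that propagates the retry's result:
#   return deleteTokMut(raw_tokens_pass, raw_text)
# The same pattern applies to the retry branch added in subTokMut below.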

Binary file modified mutate_token_delete.pyc
1 change: 1 addition & 0 deletions mutate_token_sub.py
@@ -152,6 +152,7 @@ def subTokMut(raw_tokens, raw_text):
if toTest == None:
print "Try again..."
subTokMut(raw_tokens_pass, raw_text)
return new_text, YES_TOKEN, SUBSTITUTION, out_tokens_loc, send
else:
print toTest[0]
print toTest[0].filename
Binary file modified mutate_token_sub.pyc
89 changes: 75 additions & 14 deletions py_mutations_hub.py
@@ -25,10 +25,11 @@
# ONE HOT = 87

BATCH_SIZE = 66
EPOCHS = 14017
all_tokens = []
new_tokens = []
indexed_tokens = []
global all_tokens
new_tokens_ins = []
new_tokens_del = []
new_tokens_sub = []
global indexed_tokens
data = None

def one_hot(indexed_tokens):
@@ -46,6 +47,7 @@ def set_from_json(all_tokens):
#pprint(data)
for token in all_tokens:
toCompare = token.value
global indexed_tokens
indexed_tokens.append(data["indexes"].index(toCompare))
print indexed_tokens
return one_hot(indexed_tokens)
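# Hedged sketch of the one-hot step (an assumption about one_hot's body, which
# is not shown in this diff): set_from_json turns each token value into its
# index in data["indexes"], and one_hot presumably expands each index into an
# 87-dimensional indicator vector, per the "ONE HOT = 87" note above.
#   def one_hot_sketch(indexed_tokens, vocab_size=87):
#       vectors = np.zeros((len(indexed_tokens), vocab_size))
#       vectors[np.arange(len(indexed_tokens)), indexed_tokens] = 1
#       return vectors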
@@ -141,6 +143,7 @@ def handle_token(type, token, (srow, scol), (erow, ecol), line):
else:
val = repr(token)[1:len(repr(token))-1]
send = Token(tokenize.tok_name[type], val, srow, scol, erow, ecol, line)
global all_tokens
all_tokens.append(send)
print "%d,%d-%d,%d:\t%s\t%s" % \
(srow, scol, erow, ecol, tokenize.tok_name[type], repr(token))
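# handle_token is the tokeneater callback for Python 2's
# tokenize.tokenize(readline, tokeneater): it is called once per token with
# the token type, its text, the (row, col) start and end positions, and the
# physical line, and it appends a Token record to the global all_tokens list.
# Hedged usage sketch (illustration only):
#   all_tokens = []
#   tokenize.tokenize(StringIO.StringIO("x = 1\n").readline, handle_token)
#   print len(all_tokens)   # one entry per token, including NEWLINE/ENDMARKER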
@@ -165,26 +168,81 @@ def perform():
print "CURRENT: "
print curr
if toTest == None:
global all_tokens
all_tokens = []
global indexed_tokens
indexed_tokens = []
tokenStream = tokenize.tokenize(StringIO.StringIO(all_rows[curr][0]).readline, handle_token)
print "RAW"
print "RAW"
print len(all_tokens)

one_hot_good = vocabularize_tokens(all_tokens)
print "DHVANI"
print len(one_hot_good)

raw_tokens = tokenize.generate_tokens(StringIO.StringIO(all_rows[curr][0]).readline)
source_code = str(all_rows[curr][0])

#MUTATIONS PER TOKEN
new_text, NO_TOKEN, INSERTION, out_tokens_loc = insertTokMut(raw_tokens, source_code)

# INSERT
global all_tokens
all_tokens = []
global indexed_tokens
indexed_tokens = []
print "RAW"
print len(all_tokens)
new_i_text, NO_TOKEN, INSERTION, out_tokens_loc_i = insertTokMut(raw_tokens, source_code)
print "NEXT STEP...C"
#print len(new_i_text)
#print len(source_code)
try:
newTokenStream = tokenize.tokenize(StringIO.StringIO(new_i_text).readline, handle_token)
except tokenize.TokenError:
pass
new_tokens_ins = all_tokens
print len(new_tokens_ins)
print "CC"
one_hot_bad_ins = vocabularize_tokens(new_tokens_ins)


# DELETE
raw_tokens = tokenize.generate_tokens(StringIO.StringIO(all_rows[curr][0]).readline)
global all_tokens
all_tokens = []
global indexed_tokens
indexed_tokens = []
print type(raw_tokens)
print type(source_code)
new_d_text, YES_TOKEN, DELETION, out_tokens_loc_d, sendD = deleteTokMut(raw_tokens, source_code)


print "NEXT STEP..."
try:
newTokenStream = tokenize.tokenize(StringIO.StringIO(new_text).readline, handle_token)
newTokenStream = tokenize.tokenize(StringIO.StringIO(new_d_text).readline, handle_token)
except tokenize.TokenError:
pass
one_hot_bad = vocabularize_tokens(new_tokens)
new_tokens_del = all_tokens
one_hot_bad_del = vocabularize_tokens(new_tokens_del)


# SUB
raw_tokens = tokenize.generate_tokens(StringIO.StringIO(all_rows[curr][0]).readline)
global all_tokens
all_tokens = []
global indexed_tokens
indexed_tokens = []
print type(raw_tokens)

#deleteTokMut(raw_tokens, source_code)
#subTokMut(raw_tokens, source_code)
new_s_text, YES_TOKEN, SUBSTITUTION, out_tokens_loc_s, sendS = subTokMut(raw_tokens, source_code)

print "NEXT STEP..."
try:
newTokenStream = tokenize.tokenize(StringIO.StringIO(new_s_text).readline, handle_token)
except (tokenize.TokenError, IndentationError) as e:
pass
new_tokens_sub = all_tokens
one_hot_bad_sub = vocabularize_tokens(new_tokens_sub)
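# The insert/delete/substitute blocks above repeat one pipeline: reset the
# global token buffers, apply a mutation, re-tokenize the mutated source
# through handle_token, and one-hot encode whatever was collected. A hedged
# helper sketch of that shared shape (illustration only, not the committed
# code):
#   def encode_mutation_sketch(mutated_text):
#       global all_tokens, indexed_tokens
#       all_tokens = []
#       indexed_tokens = []
#       try:
#           tokenize.tokenize(StringIO.StringIO(mutated_text).readline, handle_token)
#       except (tokenize.TokenError, IndentationError):
#           pass
#       return vocabularize_tokens(all_tokens)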

# MUTATIONS PER CHARACTER
# insertMut(source_code)
@@ -195,15 +253,18 @@ def perform():
#print one_hot_bad[0]

print len(one_hot_good)
print len(one_hot_bad)
print len(one_hot_bad_ins)
print len(one_hot_bad_del)
print len(one_hot_bad_sub)


one_hot_all = np.concatenate((one_hot_good, one_hot_bad), axis=0)
#one_hot_all = np.concatenate((one_hot_good, one_hot_bad), axis=0)

#print len(one_hot_all)
#print one_hot_all[538]

print "SUCCESS"
return one_hot_all
return one_hot_good, one_hot_bad_ins, one_hot_bad_del, one_hot_bad_sub

else:
print "Try again..."
Binary file modified py_mutations_hub.pyc
