Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added new files #1

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 19 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ Python Installed on PC preferably(Python 3.5 or Python 3)
Two .csv perceptual data files for example here they are flavornetPercepts.csv & superscentPercepts.csv

# For Python 2.7
Change urllib.request to urllib everywhere in the program
Change urllib.request to urllib everywhere in the program (As it was shifted in Python 3)

Remove encoding="utf8" everywhere in the program

Expand Down Expand Up @@ -57,3 +57,21 @@ The final output similarity comparing Superscent and Flavournet will be printed
Similarity[0,inf) in Flavournet Graphs is 23903599.9347
Similarity[0,inf) in Superscent Graphs is 38827186.0992


# For running newscript.py (Data extraction for Stem Words)

You Will need word.csv file

# STEM WORDS are words for which we want to find every derived occurrence; e.g. taint* will match taint, tainted, tainting, ...

Open Command Prompt

cd to the location of the code and the word.csv file

type "python newscript.py"

THE CODE WILL START

The errors and information will be logged in log.out file which will be automatically created in the same folder

The final output in JSON format for word will be in newjson.txt
121 changes: 121 additions & 0 deletions newscript.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
import urllib.request
import gzip
import csv
import json
from bson.objectid import ObjectId
import os
import logging

# Log everything (including errors) to log.out, recreated on each run.
logging.basicConfig(filename="log.out", filemode='w', level=logging.INFO)
logger = logging.getLogger(__name__)

try:
    file = open('word.csv', 'r', encoding="utf8")  # input list of words
    read = csv.reader(file, delimiter=',')
except OSError as err:
    # IOError is an alias of OSError in Python 3, so a single clause
    # covers both; the old second clause was dead code and also passed
    # the exception as a stray positional argument to logger.error().
    logger.error("OS error: %s", err)
    # Without word.csv the rest of the script would crash on the
    # undefined `read`; stop cleanly instead.
    raise SystemExit(1)

# Ensure newjson.txt exists so the main loop can read it on first pass.
open('newjson.txt', 'w').close()

xp = []  # unique two-letter prefixes selecting which n-gram shards to download
yp = []  # plain (non-stem) words
zp = []  # stem words (trailing '*')

# Split the CSV contents into plain words and stem words.
for row in read:
    word = row[0].strip()
    if not word:
        continue  # skip blank cells rather than crash on word[-1]
    row[0] = word
    if word[-1] == '*':
        zp.extend(row)  # stem (wildcard) words
    else:
        yp.extend(row)

# Collect the unique two-letter prefixes of the plain words; the Google
# Books 2-gram dataset is sharded by the first two letters.
for z in yp:
    z = z.strip()
    if z[:2] not in xp:
        xp.append(z[:2])

xp.sort()
print(xp)


class Encoder(json.JSONEncoder):
    """JSON encoder that serializes MongoDB ObjectId values as strings."""

    def default(self, obj):
        if isinstance(obj, ObjectId):
            return str(obj)
        # Defer to the base class, which raises TypeError for
        # unserializable objects. The old `return obj` handed the same
        # unserializable object straight back to the encoder, producing
        # infinite recursion / a "circular reference" ValueError instead
        # of a clear error.
        return super().default(obj)


def searchf(y, zp):
    """Return True if word *y* matches any stem pattern in *zp*.

    A stem pattern ends with '*'; *y* matches when it starts with the
    pattern minus its trailing '*' (e.g. 'tainted' matches 'taint*').
    """
    return any(y.startswith(stem[:-1]) for stem in zp)


# Download each two-letter n-gram shard, tally 2-gram co-occurrences for
# the tracked words, and checkpoint the results to newjson.txt after
# every shard so progress survives interruptions.
for prefix in xp:
    logger.info('downloading ' + prefix)
    try:
        urllib.request.urlretrieve(
            "http://storage.googleapis.com/books/ngrams/books/"
            "googlebooks-eng-all-2gram-20120701-" + prefix + ".gz",
            prefix + ".gz")
    except Exception:
        # Narrowed from a bare `except:`, which would also swallow
        # KeyboardInterrupt/SystemExit.
        logger.error("Error Downloading " + prefix)
        continue

    # Reload the counts accumulated by previous iterations. Opening the
    # checkpoint only after a successful download also fixes the old
    # file-handle leak on the `continue` path above.
    with open('newjson.txt', 'r') as target:
        tar = target.read()
    search = json.loads(tar) if len(tar) > 0 else {}

    # Make sure every tracked word has an entry in the result dict.
    for word in yp:
        if word not in search:
            search[word] = {}
    for stem in zp:
        if stem[:-1] not in search:
            search[stem[:-1]] = {}

    yp_set = set(yp)  # O(1) membership for the per-row checks below

    logger.info('Opening ' + prefix)
    # Stream the gzipped shard and count matching 2-grams.
    with gzip.open(prefix + '.gz', 'rt', encoding='utf-8') as f:
        reader = csv.reader(f, delimiter='\t')
        for row in reader:
            parts = str(row[0]).split(' ')
            if len(parts) != 2:
                continue  # malformed line: skip instead of crashing on unpack
            y, z = parts[0].lower(), parts[1].lower()
            if y in yp_set:
                if z in yp_set or searchf(z, zp):
                    # NOTE(review): each matching line counts as 1; the
                    # shard's own occurrence-count column is ignored —
                    # presumably intentional, but worth confirming.
                    search[y][z] = search[y].get(z, 0) + 1
            elif searchf(y, zp):
                if y not in search:
                    logger.info('new key ' + y)
                    search[y] = {}
                    # Treat this form as a known word from now on.
                    yp.append(y)
                    yp_set.add(y)
                if z in yp_set or searchf(z, zp):
                    # Increment instead of the old unconditional `= 1`,
                    # which silently reset counts for pre-seeded stem
                    # roots on every matching line.
                    search[y][z] = search[y].get(z, 0) + 1

    # Checkpoint the results. Mode 'w' already truncates, so the old
    # explicit truncate() call was redundant; `with` guarantees close.
    with open('newjson.txt', 'w') as target:
        target.write(json.dumps(search, cls=Encoder))

    logger.info('Removing ' + prefix)
    os.remove(prefix + ".gz")

file.close()
Loading