Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added new files #1

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 19 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ Python Installed on PC preferably(Python 3.5 or Python 3)
Two .csv perceptual data files for example here they are flavornetPercepts.csv & superscentPercepts.csv

# For Python 2.7
Change urllib.request to urllib everywhere in the program
Change urllib.request to urllib everywhere in the program (As it was shifted in Python 3)

Remove encoding="utf8" everywhere in the program

Expand Down Expand Up @@ -57,3 +57,21 @@ The final output similarity comparing Superscent and Flavournet will be printed
Similarity[0,inf) in Flavournet Graphs is 23903599.9347
Similarity[0,inf) in Superscent Graphs is 38827186.0992


# For running newscript.py (Data extraction for Stem Words)

You Will need word.csv file

# STEM WORDS are words for which we want to find every derived occurrence; e.g. taint* will match taint, tainted, tainting, ...

Open Command Prompt

cd to the location of the code and the word.csv file

type "python newscript.py"

THE CODE WILL START

The errors and information will be logged in log.out file which will be automatically created in the same folder

The final output in JSON format for word will be in newjson.txt
121 changes: 121 additions & 0 deletions newscript.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
import urllib.request
import gzip
import csv
import json
from bson.objectid import ObjectId
import os
import logging

# Log everything (including errors) to log.out, recreated on each run.
logging.basicConfig(filename="log.out", filemode='w', level=logging.INFO)
logger = logging.getLogger(__name__)

try:
    file = open('word.csv', 'r', encoding="utf8")  # input list of words
    read = csv.reader(file, delimiter=',')
except OSError as err:
    # IOError is an alias of OSError in Python 3, so a single clause
    # covers both; the old second clause was dead code and also passed
    # the exception as a stray positional argument to logger.error().
    logger.error("OS error: %s", err)
    # Without word.csv the rest of the script would crash on the
    # undefined `read`; stop cleanly instead.
    raise SystemExit(1)

# Ensure newjson.txt exists so the main loop can read it on first pass.
open('newjson.txt', 'w').close()

xp = []  # unique two-letter prefixes selecting which n-gram shards to download
yp = []  # plain (non-stem) words
zp = []  # stem words (trailing '*')

# Split the CSV contents into plain words and stem words.
for row in read:
    word = row[0].strip()
    if not word:
        continue  # skip blank cells rather than crash on word[-1]
    row[0] = word
    if word[-1] == '*':
        zp.extend(row)  # stem (wildcard) words
    else:
        yp.extend(row)

# Collect the unique two-letter prefixes of the plain words; the Google
# Books 2-gram dataset is sharded by the first two letters.
for z in yp:
    z = z.strip()
    if z[:2] not in xp:
        xp.append(z[:2])

xp.sort()
print(xp)


class Encoder(json.JSONEncoder):
    """JSON encoder that serializes MongoDB ObjectId values as strings."""

    def default(self, obj):
        if isinstance(obj, ObjectId):
            return str(obj)
        # Defer to the base class, which raises TypeError for
        # unserializable objects. The old `return obj` handed the same
        # unserializable object straight back to the encoder, producing
        # infinite recursion / a "circular reference" ValueError instead
        # of a clear error.
        return super().default(obj)


def searchf(y, zp):
    """Return True if word *y* matches any stem pattern in *zp*.

    A stem pattern ends with '*'; *y* matches when it starts with the
    pattern minus its trailing '*' (e.g. 'tainted' matches 'taint*').
    """
    return any(y.startswith(stem[:-1]) for stem in zp)


# Download each two-letter n-gram shard, tally 2-gram co-occurrences for
# the tracked words, and checkpoint the results to newjson.txt after
# every shard so progress survives interruptions.
for prefix in xp:
    logger.info('downloading ' + prefix)
    try:
        urllib.request.urlretrieve(
            "http://storage.googleapis.com/books/ngrams/books/"
            "googlebooks-eng-all-2gram-20120701-" + prefix + ".gz",
            prefix + ".gz")
    except Exception:
        # Narrowed from a bare `except:`, which would also swallow
        # KeyboardInterrupt/SystemExit.
        logger.error("Error Downloading " + prefix)
        continue

    # Reload the counts accumulated by previous iterations. Opening the
    # checkpoint only after a successful download also fixes the old
    # file-handle leak on the `continue` path above.
    with open('newjson.txt', 'r') as target:
        tar = target.read()
    search = json.loads(tar) if len(tar) > 0 else {}

    # Make sure every tracked word has an entry in the result dict.
    for word in yp:
        if word not in search:
            search[word] = {}
    for stem in zp:
        if stem[:-1] not in search:
            search[stem[:-1]] = {}

    yp_set = set(yp)  # O(1) membership for the per-row checks below

    logger.info('Opening ' + prefix)
    # Stream the gzipped shard and count matching 2-grams.
    with gzip.open(prefix + '.gz', 'rt', encoding='utf-8') as f:
        reader = csv.reader(f, delimiter='\t')
        for row in reader:
            parts = str(row[0]).split(' ')
            if len(parts) != 2:
                continue  # malformed line: skip instead of crashing on unpack
            y, z = parts[0].lower(), parts[1].lower()
            if y in yp_set:
                if z in yp_set or searchf(z, zp):
                    # NOTE(review): each matching line counts as 1; the
                    # shard's own occurrence-count column is ignored —
                    # presumably intentional, but worth confirming.
                    search[y][z] = search[y].get(z, 0) + 1
            elif searchf(y, zp):
                if y not in search:
                    logger.info('new key ' + y)
                    search[y] = {}
                    # Treat this form as a known word from now on.
                    yp.append(y)
                    yp_set.add(y)
                if z in yp_set or searchf(z, zp):
                    # Increment instead of the old unconditional `= 1`,
                    # which silently reset counts for pre-seeded stem
                    # roots on every matching line.
                    search[y][z] = search[y].get(z, 0) + 1

    # Checkpoint the results. Mode 'w' already truncates, so the old
    # explicit truncate() call was redundant; `with` guarantees close.
    with open('newjson.txt', 'w') as target:
        target.write(json.dumps(search, cls=Encoder))

    logger.info('Removing ' + prefix)
    os.remove(prefix + ".gz")

file.close()
Loading