
process_data.py fails with a MemoryError on Windows 10 with 8 GB of RAM. How can I solve this? #15

Open
ICfree opened this issue Nov 13, 2018 · 6 comments

Comments

@ICfree

ICfree commented Nov 13, 2018

python process_data.py ./GoogleNews-vectors-negative300.bin ./essays.csv ./mairesse.csv
loading data... data loaded!
number of status: 2467
vocab size: 30391
max sentence length: 149
loading word2vec vectors...
Traceback (most recent call last):
File "process_data.py", line 171, in
w2v = load_bin_vec(w2v_file, vocab)
File "process_data.py", line 104, in load_bin_vec
word.append(ch)
MemoryError

I suspect the problem is that the binary file being read, ./GoogleNews-vectors-negative300.bin, is too large.
How can I solve this?
How does everyone else manage to run it?
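
If the file really is too big to hold comfortably in 8 GB, one workaround (not from this thread, just a sketch) is to read the pretrained vectors with gensim, which can cap how much of the binary file is parsed and lets you keep only the words that occur in the essays. The helper name load_vectors_for_vocab and the limit value below are my own illustration, not part of process_data.py:

from gensim.models import KeyedVectors

def load_vectors_for_vocab(w2v_path, vocab, limit=500000):
    # limit caps how many entries of the binary file are parsed
    kv = KeyedVectors.load_word2vec_format(w2v_path, binary=True, limit=limit)
    # keep only the vectors for words that actually appear in the essays
    return {w: kv[w] for w in vocab if w in kv}

# word_vecs = load_vectors_for_vocab("./GoogleNews-vectors-negative300.bin", vocab)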

@vivekraghu17

import numpy as np
import theano
import pickle
from collections import defaultdict
import sys, re
import pandas as pd
import csv
import getpass

def build_data_cv(datafile, cv=10, clean_string=True):
    """
    Loads data and splits it into cv folds.
    """
    revs = []
    vocab = defaultdict(float)

    with open(datafile, "r") as csvf:
        csvreader = csv.reader(csvf, delimiter=',', quotechar='"')
        first_line = True
        for line in csvreader:
            if first_line:
                # skip the CSV header row
                first_line = False
                continue
            status = []
            sentences = re.split(r'[.?]', line[1].strip())
            try:
                sentences.remove('')
            except ValueError:
                pass

            for sent in sentences:
                if clean_string:
                    orig_rev = clean_str(sent.strip())
                    if orig_rev == '':
                        continue
                    words = set(orig_rev.split())
                    splitted = orig_rev.split()
                    if len(splitted) > 150:
                        # break overly long sentences into 20-word chunks
                        orig_rev = []
                        splits = int(np.floor(len(splitted) / 20))
                        for index in range(splits):
                            orig_rev.append(' '.join(splitted[index * 20:(index + 1) * 20]))
                        if len(splitted) > splits * 20:
                            orig_rev.append(' '.join(splitted[splits * 20:]))
                        status.extend(orig_rev)
                    else:
                        status.append(orig_rev)
                else:
                    orig_rev = sent.strip().lower()
                    words = set(orig_rev.split())
                    status.append(orig_rev)

                for word in words:
                    vocab[word] += 1

            datum = {"y0": 1 if line[2].lower() == 'y' else 0,
                     "y1": 1 if line[3].lower() == 'y' else 0,
                     "y2": 1 if line[4].lower() == 'y' else 0,
                     "y3": 1 if line[5].lower() == 'y' else 0,
                     "y4": 1 if line[6].lower() == 'y' else 0,
                     "text": status,
                     "user": line[0],
                     "num_words": np.max([len(sent.split()) for sent in status]),
                     "split": np.random.randint(0, cv)}
            revs.append(datum)

    return revs, vocab

def get_W(word_vecs, k=300):
    """
    Get word matrix. W[i] is the vector for the word indexed by i.
    """
    vocab_size = len(word_vecs)
    word_idx_map = dict()
    W = np.zeros(shape=(vocab_size + 1, k), dtype=theano.config.floatX)
    W[0] = np.zeros(k, dtype=theano.config.floatX)  # index 0 is reserved for padding
    i = 1
    for word in word_vecs:
        W[i] = word_vecs[word]
        word_idx_map[word] = i
        i += 1
    return W, word_idx_map

def load_bin_vec(fname, vocab):
    """
    Loads 300-dim word vecs from the Google (Mikolov) word2vec binary file.
    """
    word_vecs = {}
    with open(fname, "rb") as f:
        header = f.readline()
        vocab_size, layer1_size = list(map(int, header.split()))
        # vectors in the word2vec binary format are stored as float32
        binary_len = np.dtype('float32').itemsize * layer1_size
        for line in range(vocab_size):
            word = []
            while True:
                # the file is opened in binary mode, so compare bytes with bytes;
                # comparing against the str ' ' never matches in Python 3 and the
                # word buffer grows until memory runs out
                ch = f.read(1)
                if ch == b' ':
                    word = b''.join(word).decode('utf-8', errors='ignore')
                    break
                if ch != b'\n':
                    word.append(ch)
            if word in vocab:
                word_vecs[word] = np.fromstring(f.read(binary_len),
                                                dtype='float32').astype(theano.config.floatX)
            else:
                # skip the vector of a word we do not need
                f.read(binary_len)
    return word_vecs

def add_unknown_words(word_vecs, vocab, min_df=1, k=300):
    """
    For words that occur in at least min_df documents, create a separate word vector.
    0.25 is chosen so the unknown vectors have (approximately) the same variance
    as the pre-trained ones.
    """
    for word in vocab:
        if word not in word_vecs and vocab[word] >= min_df:
            word_vecs[word] = np.random.uniform(-0.25, 0.25, k)
            print(word)

def clean_str(string, TREC=False):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Every dataset is lower cased except for TREC.
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"'s", " 's ", string)
    string = re.sub(r"'ve", " have ", string)
    string = re.sub(r"n't", " not ", string)
    string = re.sub(r"'re", " are ", string)
    string = re.sub(r"'d", " would ", string)
    string = re.sub(r"'ll", " will ", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    # parentheses and question marks are regex metacharacters and must be escaped
    string = re.sub(r"\(", " ( ", string)
    string = re.sub(r"\)", " ) ", string)
    string = re.sub(r"\?", " ? ", string)

    # removes any run of four or more consecutive letters
    string = re.sub(r"[a-zA-Z]{4,}", "", string)

    string = re.sub(r"\s{2,}", " ", string)
    return string.strip() if TREC else string.strip().lower()

def clean_str_sst(string):
    """
    Tokenization/string cleaning for the SST dataset.
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()

def get_mairesse_features(file_name):
    feats = {}
    with open(file_name, "r") as csvf:
        csvreader = csv.reader(csvf, delimiter=',', quotechar='"')
        for line in csvreader:
            # first column is the user id, the rest are the Mairesse features
            feats[line[0]] = [float(f) for f in line[1:]]
    return feats

if name=="main":
w2v_file = sys.argv[1]
data_folder = sys.argv[2]
mairesse_file = sys.argv[3]
print("loading data...", end=' ')
revs, vocab = build_data_cv(data_folder, cv=10, clean_string=True)
num_words=pd.DataFrame(revs)["num_words"]
max_l = np.max(num_words)
print("data loaded!")
print("number of status: " + str(len(revs)))
print("vocab size: " + str(len(vocab)))
print("max sentence length: " + str(max_l))
print("loading word2vec vectors...", end=' ')
w2v = load_bin_vec(w2v_file, vocab)
print("word2vec loaded!")
print("num words already in word2vec: " + str(len(w2v)))
add_unknown_words(w2v, vocab)
W, word_idx_map = get_W(w2v)
rand_vecs = {}
add_unknown_words(rand_vecs, vocab)
W2, _ = get_W(rand_vecs)
mairesse = get_mairesse_features(mairesse_file)
pickle.dump([revs, W, W2, word_idx_map, vocab, mairesse], open("essays_mairesse.p", "wb"))
print("dataset created!")

@vivekraghu17

Copy-paste that code; it is optimized and also resolves the encoding issue :)
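
For context, the encoding pitfall behind the original MemoryError: the .bin file is opened in binary mode, so f.read(1) returns bytes, and comparing a byte against the str ' ' in Python 3 is always False, which means the word buffer never stops growing. A minimal standalone illustration (mine, not from the repository):

import io

f = io.BytesIO(b"word 0.1 0.2 0.3\n")  # stand-in for the word2vec .bin file
ch = f.read(1)
print(ch)           # b'w' -- bytes, not str
print(ch == ' ')    # False in Python 3, so a str comparison never breaks the read loop
print(ch == b' ')   # bytes-to-bytes comparison is what the fixed loop uses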

@wangjiwu

Copy-paste that code; it is optimized and also resolves the encoding issue :)

Thank you for fixing this. However, the indentation of the code you posted seems to have been lost when it was uploaded. Could you please share the code with its indentation intact? Thanks.

@soujanyaporia
Collaborator

Could anyone please submit a PR?

@amirmohammadkz

amirmohammadkz commented Aug 14, 2019

@soujanyaporia
I submitted a PR.
I have also made some other improvements to the training classes; you can see them in my forked repository. I can submit PRs for those as well.

priyansh19 pushed a commit to priyansh19/personality-detection that referenced this issue Jan 26, 2020
Solving the error of preprocessing not working

SenticNet#15
@priyansh19

I have made some changes and process_data.py now works.
Submitted a PR 👍
Here is my repository: https://github.com/priyansh19/personality-detection/blob/master/process_data.py
