Skip to content

Commit

Permalink
added data downloading script and processing script
Browse files Browse the repository at this point in the history
  • Loading branch information
hejia-zhang committed Apr 7, 2021
1 parent c1c3795 commit 347e59f
Show file tree
Hide file tree
Showing 3 changed files with 33 additions and 0 deletions.
2 changes: 2 additions & 0 deletions glove/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Here we store GloVe pre-trained models.

3 changes: 3 additions & 0 deletions glove/download.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#!/bin/bash
# Download the pre-trained GloVe 6B embeddings (glove.6B.zip, ~822 MB)
# into the current working directory.

# Abort on any failure (e.g. network error) instead of silently continuing.
set -euo pipefail

# Use HTTPS directly; the plain-http URL only redirects to it anyway.
wget https://nlp.stanford.edu/data/glove.6B.zip
28 changes: 28 additions & 0 deletions glove/process.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
"""This is from https://medium.com/@martinpella/how-to-use-pre-trained-word-embeddings-in-pytorch-71ca59249f76"""

import os.path as osp
import pickle

import bcolz
import numpy as np

words = []
idx = 0
word2idx = {}
glove_data_path = osp.join(osp.dirname(__file__))
vectors = bcolz.carray(np.zeros(1), rootdir=osp.join(glove_data_path, '6B.50.dat'), mode='w')

with open(osp.join(glove_data_path, 'glove.6B.50d.txt'), 'rb') as f:
for l in f:
line = l.decode().split()
word = line[0]
words.append(word)
word2idx[word] = idx
idx += 1
vect = np.array(line[1:]).astype(np.float32)
vectors.append(vect)

vectors = bcolz.carray(vectors[1:].reshape((400000, 50)), rootdir=osp.join(glove_data_path, '6B.50.dat'), mode='w')
vectors.flush()
pickle.dump(words, open(osp.join(glove_data_path, '6B.50_words.pkl'), 'wb'))
pickle.dump(word2idx, open(osp.join(glove_data_path, '6B.50_idx.pkl'), 'wb'))

0 comments on commit 347e59f

Please sign in to comment.