diff --git a/docs/tutorials/word_embedding/word_embedding.md b/docs/tutorials/word_embedding/word_embedding.md
index 6557630e80..7c80a095b3 100644
--- a/docs/tutorials/word_embedding/word_embedding.md
+++ b/docs/tutorials/word_embedding/word_embedding.md
@@ -33,11 +33,11 @@ To begin, let's first import a few packages that we'll need for this example:
 
 import warnings
 warnings.filterwarnings('ignore')
 
-from mxnet import gluon, nd
+from mxnet import gluon, np
 import gluonnlp as nlp
 import re
 import collections
-import numpy as np
+import numpy as onp
 ```
 
@@ -160,7 +160,7 @@ For example,
 
 ```{.python .input}
 def simple(words):
-    return np.ones((len(words), 300))
+    return onp.ones((len(words), 300))
 matrix = nlp.embedding.load_embeddings(vocab, 'wiki.simple', unk_method=simple)
 ```
 
@@ -217,7 +217,7 @@ input_dim, output_dim = matrix.shape
 layer = gluon.nn.Embedding(input_dim, output_dim)
 layer.initialize()
 layer.weight.set_data(matrix)
-layer(nd.array([5, 4]))[:, :5]
+layer(np.array([5, 4]))[:, :5]
 ```
 
 ### Creating Vocabulary from Pre-trained Word Embeddings
@@ -257,18 +257,17 @@ To apply word embeddings, we need to define cosine similarity. Cosine
 similarity determines the similarity between two vectors.
 
 ```{.python .input}
-import numpy as np
 def cos_sim(x, y):
-    return np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y))
+    return onp.dot(x, y) / (onp.linalg.norm(x) * onp.linalg.norm(y))
 ```
 
 The range of cosine similarity between two vectors can be between -1 and 1. The
 larger the value, the larger the similarity between the two vectors.
 
 ```{.python .input}
-x = np.array([1, 2])
-y = np.array([10, 20])
-z = np.array([-1, -2])
+x = onp.array([1, 2])
+y = onp.array([10, 20])
+z = onp.array([-1, -2])
 
 print(cos_sim(x, y))
 print(cos_sim(x, z))
@@ -287,16 +286,16 @@ We can then find the indices for which the dot product is greatest (`topk`), whi
 
 ```{.python .input}
 def norm_vecs_by_row(x):
-    return x / np.sqrt(np.sum(x * x, axis=1) + 1E-10).reshape((-1,1))
+    return x / onp.sqrt(onp.sum(x * x, axis=1) + 1E-10).reshape((-1,1))
 
 def topk(res, k):
-    part = np.argpartition(res, -k)[-k:]
-    return part[np.argsort(res[part])].tolist()[::-1]
+    part = onp.argpartition(res, -k)[-k:]
+    return part[onp.argsort(res[part])].tolist()[::-1]
 
 def get_knn(vocab, matrix, k, word):
     word_vec = matrix[vocab[word]].reshape((-1, 1))
     vocab_vecs = norm_vecs_by_row(matrix)
-    dot_prod = np.dot(vocab_vecs, word_vec)
+    dot_prod = onp.dot(vocab_vecs, word_vec)
     indices = topk(dot_prod.reshape((len(vocab), )), k=k+1)
     # Remove unknown and input tokens.
     return vocab.to_tokens(indices[1:])
@@ -351,7 +350,7 @@ def get_top_k_by_analogy(vocab, matrix, k, word1, word2, word3):
     word_vecs = [matrix[vocab[word]] for word in [word1, word2, word3]]
     word_diff = (word_vecs[1] - word_vecs[0] + word_vecs[2]).reshape((-1, 1))
     vocab_vecs = norm_vecs_by_row(matrix)
-    dot_prod = np.dot(vocab_vecs, word_diff)
+    dot_prod = onp.dot(vocab_vecs, word_diff)
     indices = topk(dot_prod.reshape((len(vocab), )), k=k)
     return vocab.to_tokens(indices)
 ```
diff --git a/setup.py b/setup.py
index baf44e6110..0297e4eeed 100644
--- a/setup.py
+++ b/setup.py
@@ -40,7 +40,7 @@ def find_version(*file_paths):
     'contextvars',
     'pyarrow',
     'sentencepiece==0.1.95',
-    'protobuf',
+    'protobuf<=3.20.1',
     'pandas',
     'tokenizers==0.9.4',
     'dataclasses;python_version<"3.7"',  # Dataclass for python <= 3.6
diff --git a/tests/test_utils_misc.py b/tests/test_utils_misc.py
index de6b3198aa..6515cdf7d5 100644
--- a/tests/test_utils_misc.py
+++ b/tests/test_utils_misc.py
@@ -52,6 +52,7 @@ def test_download_s3(overwrite):
                     overwrite=overwrite)
 
 
+@pytest.mark.skip("RuntimeError: Failed downloading url https://commoncrawl.s3.amazonaws.com/crawl-data/CC-MAIN-2014-41/cc-index.paths.gz")
 @pytest.mark.remote_required
 @pytest.mark.parametrize('overwrite', [False, True])
 def test_download_https(overwrite):
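Note on the rename pattern in the tutorial diff: after `from mxnet import np`, the name `np` refers to MXNet's NumPy-compatible array module, so classic NumPy is re-imported under the `onp` ("original NumPy") alias and all pure-CPU math is routed through it. A minimal sketch of the two namespaces coexisting, assuming an MXNet build that ships the `mxnet.np` module; the array values are illustrative only:

```python
from mxnet import np   # MXNet's NumPy-compatible, device-aware arrays
import numpy as onp    # classic NumPy under a distinct alias

a = np.array([1.0, 2.0])    # mxnet.numpy.ndarray
b = onp.array([1.0, 2.0])   # plain numpy.ndarray

# Crossing the boundary explicitly when an API expects one or the other:
c = np.array(b)    # NumPy -> MXNet
d = a.asnumpy()    # MXNet -> NumPy
print(type(a).__module__, type(d).__module__)
```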
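The `layer(np.array([5, 4]))` change means the embedding lookup now takes an MXNet `np` array rather than a legacy `nd` one. A runnable sketch of the same pattern with a random stand-in matrix; the 10x300 shape and the `npx.set_np()` call are assumptions for this sketch, not part of the tutorial:

```python
import mxnet as mx
from mxnet import gluon, np, npx
import numpy as onp

npx.set_np()  # enable NumPy semantics in Gluon (a no-op where already default)

matrix = onp.random.rand(10, 300)  # stand-in for the loaded embedding matrix

input_dim, output_dim = matrix.shape
layer = gluon.nn.Embedding(input_dim, output_dim)
layer.initialize()
layer.weight.set_data(mx.np.array(matrix))
print(layer(np.array([5, 4])).shape)  # (2, 300): one row per looked-up index
```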
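To make the ranking logic behind `get_knn` concrete, here is a self-contained toy run of the refactored `cos_sim`, `norm_vecs_by_row`, and `topk` helpers in classic NumPy; the 4-row matrix is made up for illustration:

```python
import numpy as onp

def cos_sim(x, y):
    return onp.dot(x, y) / (onp.linalg.norm(x) * onp.linalg.norm(y))

def norm_vecs_by_row(x):
    # L2-normalize each row; the epsilon guards against division by zero.
    return x / onp.sqrt(onp.sum(x * x, axis=1) + 1E-10).reshape((-1, 1))

def topk(res, k):
    # Indices of the k largest scores, in descending order of score.
    part = onp.argpartition(res, -k)[-k:]
    return part[onp.argsort(res[part])].tolist()[::-1]

# Toy "embedding matrix": row 1 points the same way as row 0,
# row 2 is orthogonal, row 3 is opposite.
matrix = onp.array([[1.0, 0.0],
                    [2.0, 0.0],
                    [0.0, 1.0],
                    [-1.0, 0.0]])

query = matrix[0].reshape((-1, 1))
scores = onp.dot(norm_vecs_by_row(matrix), query).reshape((len(matrix),))
print(topk(scores, k=2))              # the two colinear rows (0 and 1) rank first
print(cos_sim(matrix[0], matrix[3]))  # -1.0 for exactly opposite vectors
```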