diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..b16bf79 --- /dev/null +++ b/.gitignore @@ -0,0 +1,173 @@ +### Project +sent_eval/data/senteval_data +skip_thoughts/model + +/bazel-bin +/bazel-ci_build-cache +/bazel-genfiles +/bazel-out +/bazel-skip_thoughts +/bazel-testlogs +/bazel-tf +*.pyc +*~ + + +### macOS template +*.DS_Store +.AppleDouble +.LSOverride + +# Icon must end with two \r +Icon + + +# Thumbnails +._* + +# Files that might appear in the root of a volume +.DocumentRevisions-V100 +.fseventsd +.Spotlight-V100 +.TemporaryItems +.Trashes +.VolumeIcon.icns +.com.apple.timemachine.donotpresent + +# Directories potentially created on remote AFP share +.AppleDB +.AppleDesktop +Network Trash Folder +Temporary Items +.apdisk + + +### JetBrains template +# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm +# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 + +# User-specific stuff: +.idea/ + +## File-based project format: +*.iws + +## Plugin-specific files: + +# IntelliJ +/out/ + +# mpeltonen/sbt-idea plugin +.idea_modules/ + +# JIRA plugin +atlassian-ide-plugin.xml + +# Crashlytics plugin (for Android Studio and IntelliJ) +com_crashlytics_export_strings.xml +crashlytics.properties +crashlytics-build.properties +fabric.properties + + +### Python template +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +env/ +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*,cover +.hypothesis/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# dotenv +.env + +# virtualenv +.venv +venv/ +ENV/ + +# Spyder project settings +.spyderproject + +# Rope project settings +.ropeproject +sent_eval/examples/glove/ +results/ +.snakemake/ +.swp + + diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..37ce346 --- /dev/null +++ b/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. 
For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. 
This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright 2017 Babylon Partners. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
diff --git a/NOTICE b/NOTICE
new file mode 100644
index 0000000..c739223
--- /dev/null
+++ b/NOTICE
@@ -0,0 +1,13 @@
+Decoding Decoders
+Copyright 2018 Babylon Partners.
+
+This repository includes software developed
+at Babylon Partners (babylonhealth.com).
+
+Portions of this software were developed by
+The TensorFlow Authors.
+https://github.com/tensorflow/models/blob/master/AUTHORS
+
+This software contains code derived from
+The TensorFlow Authors.
+https://github.com/tensorflow/models/blob/master/AUTHORS
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..02c40ba
--- /dev/null
+++ b/README.md
@@ -0,0 +1,188 @@
+# Decoding Decoders: Finding Optimal Representation Spaces for Unsupervised Similarity Tasks
+
+TensorFlow implementation of the models described in the
+[Decoding Decoders](https://openreview.net/forum?id=SJOOAEJwf) paper.
+
+This codebase builds on top of the [TensorFlow Skip-Thoughts](https://github.com/tensorflow/models/tree/master/research/skip_thoughts) implementation by Chris Shallue
+and uses [SentEval](https://github.com/facebookresearch/SentEval) from Facebook for evaluation on transfer tasks.
+
+The aim is to study how different choices of decoder affect performance on unsupervised similarity tasks such as STS.
+
+
+## Contents
+* [Requirements](#requirements)
+* [Data Preprocessing](#data-preprocessing)
+* [Training](#training)
+* [Vocabulary Expansion](#vocabulary-expansion)
+* [Evaluation](#evaluation)
+
+## Requirements
+
+This code uses Python 2.7. Please install the requirements in `requirements.txt`.
+
+
+## Data Preprocessing
+
+### Preparation
+
+You will need to obtain the BookCorpus dataset from [this website](http://yknzhu.wixsite.com/mbweb).
+
+### Quick run
+```shell
+
+# Comma-separated list of globs matching the input files. The format of
+# the input files is assumed to be a list of newline-separated sentences, where
+# each sentence is already tokenized.
+INPUT_FILES=
+
+# Location to save the preprocessed training and validation data.
+DATA_DIR=
+
+# Run the preprocessing script.
+python -m skip_thoughts.data.preprocess_dataset \
+  --input_files=${INPUT_FILES} \
+  --output_dir=${DATA_DIR}
+```
+
+
+## Training
+
+### Training params
+We added a couple of new parameters to the `train.py` script.
+The most important ones are described here; please see the code for the additional functionality we have added.
+
+**`--decoder=SEQxSKGy`** where `x` and `y` can each be `0`, `1`, `2` or `3`.
+
+SEQ stands for a sequence (recurrent) decoder and SKG stands for a bag-of-words (BOW) decoder.
+* `0` - no decoder of this type is present
+* `1` - decoder for the current sentence (Autoencoder)
+* `2` - decoders for the previous and next sentences (Skip-Thought/FastSent style)
+* `3` - decoders for the previous, current, and next sentences (Skip-Thought + Autoencoder)
+
+Note that it is possible to combine SEQ and SKG decoders.
+
+**`--skipgram_encoder=True|False`**
+
+* `True` The architecture has a bag-of-words (BOW) encoder.
+* `False` The architecture has a sequence (RNN) encoder.
+
+Defaults to `False`.
+
+### Quick run
+```shell
+# Directory containing the preprocessed data.
+DATA_DIR=
+
+# Directory to save the model. Note: a new folder called run_{unixtimestamp} will be created here, and the model checkpoints will be saved into it. The FLAGS file, as well as its dict and JSON representations, will be stored as `flags.pkl`, `config.pkl` and `config.json` respectively.
+RUN_DIR=
+
+# Model decoder configuration (choose one of SEQ0SKG2, SEQ0SKG3, SEQ2SKG2 or SEQ3SKG3)
+DECODER="SEQ0SKG2"
+
+# Whether to use a skipgram (BOW) encoder (choose True or False). Defaults to False.
+SKIPGRAM_ENCODER=False
+
+# Run the training script.
+
+python -m skip_thoughts.train \
+  --input_file_pattern="${DATA_DIR}/train-?????-of-00100" \
+  --run_dir="${RUN_DIR}" \
+  --decoder="${DECODER}" \
+  --skipgram_encoder="${SKIPGRAM_ENCODER}"
+```
+This will train a model with an RNN encoder and 2 BOW decoders.
+
+## Vocabulary Expansion
+
+### Preparation
+
+You will need to download the pretrained Google News word2vec vectors, found [here](https://code.google.com/archive/p/word2vec/).
+Please see the Skip-Thoughts [README](/skip_thoughts/README.md) for more details on vocabulary expansion.
+
+### Quick run
+```shell
+MODEL_DIR=
+SKIP_THOUGHTS_VOCAB=
+W2VMODEL=
+LOG_FILE=
+
+python -m skip_thoughts.vocabulary_expansion \
+  --skip_thoughts_model="${MODEL_DIR}" \
+  --skip_thoughts_vocab="${SKIP_THOUGHTS_VOCAB}" \
+  --word2vec_model="${W2VMODEL}" \
+  --output_dir="${MODEL_DIR}" \
+  > "${LOG_FILE}" 2>&1
+```
+
+
+## Evaluation
+
+### Preparation
+
+You will need to clone the [SentEval repo](https://github.com/facebookresearch/SentEval) and download the data as instructed there.
+Then copy our scripts from `sent_eval/evaluation` to the `examples` directory to run.
+
+### The scripts
+
+The [`SentEval` evaluation scripts](/sent_eval/evaluation) use either the encoder output (which we confusingly call `context` here) or the unrolled decoder (which we less confusingly call `unroll`) for the similarity and transfer tasks.
+
+The similarity scripts
+[similarity_context.py](/sent_eval/evaluation/similarity_context.py) and
+[similarity_unroll.py](/sent_eval/evaluation/similarity_unroll.py)
+run the `STS*` tasks (`STS12`, `STS13`, `STS14`, `STS15` and `STS16`) of `SentEval`.
+
+The transfer scripts
+[transfer_context.py](/sent_eval/evaluation/transfer_context.py) and
+[transfer_unroll.py](/sent_eval/evaluation/transfer_unroll.py)
+run the transfer tasks (`CR`, `MR`, `MPQA`, `SUBJ`, `SST`, `TREC`, `MRPC`,
+ `SICKRelatedness`, `SICKEntailment` and `STSBenchmark`) of `SentEval`.
+
+Each script runs with 10-fold cross-validation and saves the dictionary of all results as a pickle to the desired location. This can then be used for easy generation of plots and other analysis.
+
+#### Context
+
+The context scripts
+[similarity_context.py](/sent_eval/evaluation/similarity_context.py) and
+[transfer_context.py](/sent_eval/evaluation/transfer_context.py) work for all decoder types.
+
+The parameters of the context scripts are:
+
++ `--model_dir` The path to the saved model you want to evaluate. Specifically, this should be a folder containing the checkpoint and decoder configuration information produced by [`train.py`](/skip_thoughts/train.py).
++ `--output_results_path` The full path to save the pickle file containing all of the results from this evaluation.
+
+#### Unroll
+
+The unroll scripts
+[similarity_unroll.py](/sent_eval/evaluation/similarity_unroll.py) and
+[transfer_unroll.py](/sent_eval/evaluation/transfer_unroll.py) only work for RNN decoder types, and use the [decoder unrolling mechanism](/unrolling_the_decoder.md) discussed in the [Decoding Decoders](https://openreview.net/forum?id=SJOOAEJwf) paper.
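+
+In essence, unrolling initialises a trained decoder with the encoder output, steps it forward for a fixed number of time steps without any ground-truth inputs, and pools the resulting hidden states into a sentence representation. Below is a minimal sketch of the idea, not the actual implementation in [`decode.py`](/skip_thoughts/decode.py); `cell_step` is a hypothetical stand-in for one step of the decoder cell:
+
+```python
+import numpy as np
+
+def unroll(sentence_vector, cell_step, steps, decoder_type='mean'):
+    # The encoder output initialises the decoder state; there are no
+    # ground-truth words at evaluation time, so feed a fixed zero input.
+    state = sentence_vector
+    inputs = np.zeros_like(state)
+    hidden_states = []
+    for _ in range(steps):
+        state = cell_step(inputs, state)   # one decoder time step
+        hidden_states.append(state)
+    h = np.stack(hidden_states)            # [steps, dim]
+    if decoder_type == 'mean':
+        return h.mean(axis=0)              # [dim]
+    return h.reshape(-1)                   # 'concat': [steps * dim]
+```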
+
+In addition to the parameters of the context scripts (above), the unroll scripts require the following parameters:
+
++ `--unroll_length` This should be a positive integer, and corresponds to how many time steps each decoder will "unroll" for to produce the sentence representation.
++ `--decoder_type` This should be either `'mean'` or `'concat'`, and corresponds to taking the sentence representation as the mean or the concatenation of the unrolled hidden states, respectively.
+
+### Quick run
+The example below is for running `similarity_context.py`; the exact same process works for the other evaluation scripts.
+```shell
+# Directory to load the model from
+MODEL_DIR=
+
+# Which GPU(s) to use (e.g. one of 0, 1 or 0,1)
+GPU_IDS=0
+
+# Log file
+LOG_FILE=
+
+# Pickle save path
+PICKLE_PATH=
+
+# Run the evaluation script.
+CUDA_VISIBLE_DEVICES=$GPU_IDS \
+  python -m sent_eval.evaluation.similarity_context \
+  --model_dir="${MODEL_DIR}" \
+  --output_results_path="${PICKLE_PATH}" \
+  > "${LOG_FILE}" 2>&1
+```
+
+## Contact
+
+Vitalii Zhelezniak
diff --git a/WORKSPACE b/WORKSPACE
new file mode 100644
index 0000000..e69de29
diff --git a/images/unroll.png b/images/unroll.png
new file mode 100644
index 0000000..480b604
Binary files /dev/null and b/images/unroll.png differ
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..6d433d2
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,7 @@
+gensim==3.2.0
+nltk==3.2.4
+numpy==1.13.1
+scikit-learn==0.19.0
+scipy==0.19.1
+tensorflow==1.3.0
+tqdm==4.15.0
diff --git a/sent_eval/evaluation/__init__.py b/sent_eval/evaluation/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/sent_eval/evaluation/similarity_context.py b/sent_eval/evaluation/similarity_context.py
new file mode 100644
index 0000000..ab3a555
--- /dev/null
+++ b/sent_eval/evaluation/similarity_context.py
@@ -0,0 +1,136 @@
+# Copyright 2018 Babylon Partners. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import cPickle
+import sys
+
+from skip_thoughts import configuration
+from skip_thoughts import encoder_manager
+
+import tensorflow as tf
+
+import logging
+import skip_thoughts.experiments as experiments
+
+from sent_eval.examples.exutil import dotdict
+
+
+FLAGS = tf.flags.FLAGS
+
+
+tf.flags.DEFINE_string("model_dir", None,
+                       "Directory for saving and loading checkpoints.")
+tf.flags.DEFINE_string("output_results_path", None,
+                       "Path to save pickled results to.")
+tf.flags.DEFINE_bool("use_eos", True,
+                     "Whether to use the eos token during the encoder unroll.")
+
+if not FLAGS.model_dir:
+    raise ValueError("--model_dir is required.")
+if not FLAGS.output_results_path:
+    raise ValueError("--output_results_path is required.")
+
+# Set paths to the model.
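+# (vocab.txt and embeddings.npy are produced by the vocabulary expansion
+# script; flags.pkl is written by train.py alongside the checkpoints.)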
+VOCAB_FILE = os.path.join(FLAGS.model_dir, "vocab.txt")
+EMBEDDING_MATRIX_FILE = os.path.join(FLAGS.model_dir, "embeddings.npy")
+CHECKPOINT_PATH = FLAGS.model_dir
+FLAGS_PICKLE_PATH = os.path.join(FLAGS.model_dir, "flags.pkl")
+
+# Load the configuration used to make the model
+with open(FLAGS_PICKLE_PATH, 'rb') as f:
+    model_flags = cPickle.load(f)
+
+decoder_config = experiments.get_decoder_config(flags=model_flags)
+model_config = configuration.model_config(
+    input_file_pattern=model_flags.input_file_pattern,
+    vocab_size=model_flags.vocab_size,
+    batch_size=model_flags.batch_size,
+    word_embedding_dim=model_flags.word_dim,
+    encoder_dim=model_flags.encoder_dim,
+    skipgram_encoder=model_flags.skipgram_encoder,
+    sequence_decoder_pre=decoder_config.sequence_decoder_pre,
+    sequence_decoder_cur=decoder_config.sequence_decoder_cur,
+    sequence_decoder_post=decoder_config.sequence_decoder_post,
+    skipgram_decoder_pre=decoder_config.skipgram_decoder_pre,
+    skipgram_decoder_cur=decoder_config.skipgram_decoder_cur,
+    skipgram_decoder_post=decoder_config.skipgram_decoder_post,
+    share_weights_logits=model_flags.share_weights_logits,
+    normalise_decoder_losses=model_flags.normalise_decoder_losses,
+    skipgram_prefactor=model_flags.skipgram_prefactor,
+    sequence_prefactor=model_flags.sequence_prefactor)
+
+# Set up the encoder. Here we are using a single unidirectional model.
+# To use a bidirectional model as well, call load_model() again with
+# configuration.model_config(bidirectional_encoder=True) and paths to the
+# bidirectional model's files. The encoder will use the concatenation of
+# all loaded models.
+encoder = encoder_manager.EncoderManager()
+encoder.load_model(model_config=model_config,
+                   vocabulary_file=VOCAB_FILE,
+                   embedding_matrix_file=EMBEDDING_MATRIX_FILE,
+                   checkpoint_path=CHECKPOINT_PATH)
+
+# encodings = encoder.encode(data)
+
+
+# Set PATHs
+current_path = os.path.dirname(__file__)
+PATH_TO_SENTEVAL = os.path.join(current_path, '../')
+PATH_TO_DATA = os.path.join(current_path, '../data/senteval_data')
+
+# import SentEval
+sys.path.insert(0, PATH_TO_SENTEVAL)
+import senteval
+
+
+# consider the option of lower-casing or not for bow.
+def prepare(params, samples):
+    params.batch_size = 128
+    # set to 10 to be comparable to published results
+    params.kfold = 10
+    return
+
+
+def batcher(params, batch):
+    batch = [" ".join(sent) if sent != [] else " ".join(['.'])
+             for sent in batch]
+    return encoder.encode(batch, use_eos=FLAGS.use_eos)
+
+
+# Set params for SentEval
+params_senteval = {'task_path': PATH_TO_DATA, 'usepytorch': False, 'kfold': 10}
+params_senteval = dotdict(params_senteval)
+
+# Set up logger
+logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.DEBUG)
+
+transfer_tasks = ['CR', 'MR', 'MPQA', 'SUBJ', 'SST', 'TREC', 'MRPC',
+                  'SICKRelatedness', 'SICKEntailment', 'STSBenchmark',
+                  'SNLI', 'ImageCaptionRetrieval']
+
+similarity_tasks = ['STS12', 'STS13', 'STS14', 'STS15', 'STS16']
+
+if __name__ == "__main__":
+    se = senteval.SentEval(params_senteval, batcher, prepare)
+    tasks = similarity_tasks
+    results = se.eval(tasks)
+    f = open(FLAGS.output_results_path, 'wb')
+    cPickle.dump(results, f)
+    f.close()
diff --git a/sent_eval/evaluation/similarity_unroll.py b/sent_eval/evaluation/similarity_unroll.py
new file mode 100644
index 0000000..6de661e
--- /dev/null
+++ b/sent_eval/evaluation/similarity_unroll.py
@@ -0,0 +1,203 @@
+# Copyright 2018 Babylon Partners. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import cPickle
+import sys
+
+from skip_thoughts import configuration
+from skip_thoughts import decode
+from skip_thoughts import encoder_manager
+from skip_thoughts import experiments
+
+import numpy as np
+import tensorflow as tf
+
+import logging
+
+from sent_eval.examples.exutil import dotdict
+
+
+FLAGS = tf.flags.FLAGS
+
+
+tf.flags.DEFINE_string("model_dir", None,
+                       "Directory for saving and loading checkpoints.")
+tf.flags.DEFINE_string("output_results_path", None,
+                       "Path to save pickled results to.")
+tf.flags.DEFINE_bool("use_eos", True,
+                     "Whether to use the eos token during the encoder unroll.")
+tf.flags.DEFINE_integer("unroll_length", None,
+                        "Number of time steps to unroll each decoder for.")
+tf.flags.DEFINE_string("decoder_type", None,
+                       "How to pool the unrolled hidden states: 'mean' or 'concat'.")
+
+if not FLAGS.model_dir:
+    raise ValueError("--model_dir is required.")
+if not FLAGS.output_results_path:
+    raise ValueError("--output_results_path is required.")
+if not FLAGS.unroll_length:
+    raise ValueError("--unroll_length is required.")
+
+decoder_types = ['mean', 'concat']
+if FLAGS.decoder_type not in decoder_types:
+    raise ValueError("--decoder_type must be one of {t}".format(t=decoder_types))
+
+# Set paths to the model.
+VOCAB_FILE = os.path.join(FLAGS.model_dir, "vocab.txt")
+EMBEDDING_MATRIX_FILE = os.path.join(FLAGS.model_dir, "embeddings.npy")
+CHECKPOINT_PATH = FLAGS.model_dir
+FLAGS_PICKLE_PATH = os.path.join(FLAGS.model_dir, "flags.pkl")
+
+# Load the configuration used to make the model
+with open(FLAGS_PICKLE_PATH, 'rb') as f:
+    model_flags = cPickle.load(f)
+
+decoder_config = experiments.get_decoder_config(flags=model_flags)
+model_config = configuration.model_config(
+    input_file_pattern=model_flags.input_file_pattern,
+    vocab_size=model_flags.vocab_size,
+    batch_size=model_flags.batch_size,
+    word_embedding_dim=model_flags.word_dim,
+    encoder_dim=model_flags.encoder_dim,
+    skipgram_encoder=model_flags.skipgram_encoder,
+    sequence_decoder_pre=decoder_config.sequence_decoder_pre,
+    sequence_decoder_cur=decoder_config.sequence_decoder_cur,
+    sequence_decoder_post=decoder_config.sequence_decoder_post,
+    skipgram_decoder_pre=decoder_config.skipgram_decoder_pre,
+    skipgram_decoder_cur=decoder_config.skipgram_decoder_cur,
+    skipgram_decoder_post=decoder_config.skipgram_decoder_post,
+    share_weights_logits=model_flags.share_weights_logits,
+    normalise_decoder_losses=model_flags.normalise_decoder_losses,
+    skipgram_prefactor=model_flags.skipgram_prefactor,
+    sequence_prefactor=model_flags.sequence_prefactor)
+
+# Set up the encoder. Here we are using a single unidirectional model.
+# To use a bidirectional model as well, call load_model() again with +# configuration.model_config(bidirectional_encoder=True) and paths to the +# bidirectional model's files. The encoder will use the concatenation of +# all loaded models. +encoder = encoder_manager.EncoderManager() +encoder.load_model(model_config=model_config, + vocabulary_file=VOCAB_FILE, + embedding_matrix_file=EMBEDDING_MATRIX_FILE, + checkpoint_path=CHECKPOINT_PATH, + mode='encode-decode') + +# Build the decoder +g = encoder.graph +sess = encoder.sessions[0] + +tensor_names_global = { + 'word_embedding': 'word_embedding:0'} + +tensor_names_pre = { + 'logits': 'logits/logits/decoder_pre:0', + 'decoder_output': 'decoder_pre/decoder_output:0', + 'decoder_state': 'decoder_pre/decoder_state:0'} + +tensor_names_post = { + 'logits': 'logits_1/logits/decoder_post:0', + 'decoder_output': 'decoder_post/decoder_output:0', + 'decoder_state': 'decoder_post/decoder_state:0'} + +decoder_pre = decode.Decoder( + g=g, + tensor_names_decoder=tensor_names_pre, + tensor_names_global=tensor_names_global) + +decoder_post = decode.Decoder( + g=g, + tensor_names_decoder=tensor_names_post, + tensor_names_global=tensor_names_global) + + +# encodings = encoder.encode(data) + + +# Set PATHs +current_path = os.path.dirname(__file__) +PATH_TO_SENTEVAL = os.path.join(current_path, '../') +PATH_TO_DATA = os.path.join(current_path, '../data/senteval_data') + +# import SentEval +sys.path.insert(0, PATH_TO_SENTEVAL) +import senteval + + +# consider the option of lower-casing or not for bow. +def prepare(params, samples): + params.batch_size = 128 + # set to 10 to be comparable to published results + params.kfold = 10 + return + + +def batcher_steps(steps, decoder_type): + def batcher(params, batch): + batch = [" ".join(sent) if sent != [] else " ".join(['.']) + for sent in batch] + + decode_pre_rep, decode_post_rep = decode.decode( + sess=sess, data=batch, + encoder=encoder, + decoder_pre=decoder_pre, + decoder_post=decoder_post, + steps=steps, + use_eos=FLAGS.use_eos) + + decode_rep_concat = np.concatenate( + (np.array(decode_pre_rep), np.array(decode_post_rep)), axis=1) + + if decoder_type == 'mean': + return np.mean(decode_rep_concat, axis=1) + + this_batch_size = len(decode_rep_concat) + return np.reshape(decode_rep_concat, (this_batch_size, -1)) + + return batcher + + +# Set params for SentEval +params_senteval = {'task_path': PATH_TO_DATA, 'usepytorch': False, 'kfold': 10} +params_senteval = dotdict(params_senteval) + +# Set up logger +logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.DEBUG) + +transfer_tasks = ['CR', 'MR', 'MPQA', 'SUBJ', 'SST', 'TREC', 'MRPC', + 'SICKRelatedness', 'SICKEntailment', 'STSBenchmark', + 'SNLI', 'ImageCaptionRetrieval'] + +similarity_tasks = ['STS12', 'STS13', 'STS14', 'STS15', 'STS16'] + +if __name__ == "__main__": + print( + "Building batcher with unroll length {ul} and decoder type {d}".format( + ul=FLAGS.unroll_length, d=FLAGS.decoder_type)) + + batcher = batcher_steps(steps=FLAGS.unroll_length, + decoder_type=FLAGS.decoder_type) + + se = senteval.SentEval(params_senteval, batcher, prepare) + tasks = similarity_tasks + results = se.eval(tasks) + f = open(FLAGS.output_results_path, 'wb') + cPickle.dump(results, f) + f.close() diff --git a/sent_eval/evaluation/transfer_context.py b/sent_eval/evaluation/transfer_context.py new file mode 100644 index 0000000..8f5ef4d --- /dev/null +++ b/sent_eval/evaluation/transfer_context.py @@ -0,0 +1,135 @@ +# Copyright 2018 Babylon Partners. 
All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import cPickle
+import sys
+
+from skip_thoughts import configuration
+from skip_thoughts import encoder_manager
+
+import tensorflow as tf
+
+import logging
+import skip_thoughts.experiments as experiments
+
+from sent_eval.examples.exutil import dotdict
+
+
+FLAGS = tf.flags.FLAGS
+
+
+tf.flags.DEFINE_string("model_dir", None,
+                       "Directory for saving and loading checkpoints.")
+tf.flags.DEFINE_string("output_results_path", None,
+                       "Path to save pickled results to.")
+tf.flags.DEFINE_bool("use_eos", True,
+                     "Whether to use the eos token during the encoder unroll.")
+
+if not FLAGS.model_dir:
+    raise ValueError("--model_dir is required.")
+if not FLAGS.output_results_path:
+    raise ValueError("--output_results_path is required.")
+
+# Set paths to the model.
+VOCAB_FILE = os.path.join(FLAGS.model_dir, "vocab.txt")
+EMBEDDING_MATRIX_FILE = os.path.join(FLAGS.model_dir, "embeddings.npy")
+CHECKPOINT_PATH = FLAGS.model_dir
+FLAGS_PICKLE_PATH = os.path.join(FLAGS.model_dir, "flags.pkl")
+
+# Load the configuration used to make the model
+with open(FLAGS_PICKLE_PATH, 'rb') as f:
+    model_flags = cPickle.load(f)
+
+decoder_config = experiments.get_decoder_config(flags=model_flags)
+model_config = configuration.model_config(
+    input_file_pattern=model_flags.input_file_pattern,
+    vocab_size=model_flags.vocab_size,
+    batch_size=model_flags.batch_size,
+    word_embedding_dim=model_flags.word_dim,
+    encoder_dim=model_flags.encoder_dim,
+    skipgram_encoder=model_flags.skipgram_encoder,
+    sequence_decoder_pre=decoder_config.sequence_decoder_pre,
+    sequence_decoder_cur=decoder_config.sequence_decoder_cur,
+    sequence_decoder_post=decoder_config.sequence_decoder_post,
+    skipgram_decoder_pre=decoder_config.skipgram_decoder_pre,
+    skipgram_decoder_cur=decoder_config.skipgram_decoder_cur,
+    skipgram_decoder_post=decoder_config.skipgram_decoder_post,
+    share_weights_logits=model_flags.share_weights_logits,
+    normalise_decoder_losses=model_flags.normalise_decoder_losses,
+    skipgram_prefactor=model_flags.skipgram_prefactor,
+    sequence_prefactor=model_flags.sequence_prefactor)
+
+# Set up the encoder. Here we are using a single unidirectional model.
+# To use a bidirectional model as well, call load_model() again with
+# configuration.model_config(bidirectional_encoder=True) and paths to the
+# bidirectional model's files. The encoder will use the concatenation of
+# all loaded models.
+encoder = encoder_manager.EncoderManager() +encoder.load_model(model_config=model_config, + vocabulary_file=VOCAB_FILE, + embedding_matrix_file=EMBEDDING_MATRIX_FILE, + checkpoint_path=CHECKPOINT_PATH) + +# encodings = encoder.encode(data) + + +# Set PATHs +current_path = os.path.dirname(__file__) +PATH_TO_SENTEVAL = os.path.join(current_path, '../') +PATH_TO_DATA = os.path.join(current_path, '../data/senteval_data') + +# import SentEval +sys.path.insert(0, PATH_TO_SENTEVAL) +import senteval + + +# consider the option of lower-casing or not for bow. +def prepare(params, samples): + params.batch_size = 128 + # set to 10 to be comparable to published results + params.kfold = 10 + return + + +def batcher(params, batch): + batch = [" ".join(sent) if sent != [] else " ".join(['.']) + for sent in batch] + return encoder.encode(batch, use_eos=FLAGS.use_eos) + + +# Set params for SentEval +params_senteval = {'task_path': PATH_TO_DATA, 'usepytorch': False, 'kfold': 10} +params_senteval = dotdict(params_senteval) + +# Set up logger +logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.DEBUG) + +transfer_tasks = ['CR', 'MR', 'MPQA', 'SUBJ', 'SST', 'TREC', 'MRPC', + 'SICKRelatedness', 'SICKEntailment', 'STSBenchmark'] + +similarity_tasks = ['STS12', 'STS13', 'STS14', 'STS15', 'STS16'] + +if __name__ == "__main__": + se = senteval.SentEval(params_senteval, batcher, prepare) + tasks = transfer_tasks + results = se.eval(tasks) + f = open(FLAGS.output_results_path, 'wb') + cPickle.dump(results, f) + f.close() diff --git a/sent_eval/evaluation/transfer_unroll.py b/sent_eval/evaluation/transfer_unroll.py new file mode 100644 index 0000000..99a1862 --- /dev/null +++ b/sent_eval/evaluation/transfer_unroll.py @@ -0,0 +1,198 @@ +# Copyright 2018 Babylon Partners. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ==============================================================================
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import cPickle
+import sys
+
+from skip_thoughts import configuration
+from skip_thoughts import decode
+from skip_thoughts import encoder_manager
+from skip_thoughts import experiments
+
+import numpy as np
+import tensorflow as tf
+
+import logging
+
+from sent_eval.examples.exutil import dotdict
+
+
+FLAGS = tf.flags.FLAGS
+
+
+tf.flags.DEFINE_string("model_dir", None,
+                       "Directory for saving and loading checkpoints.")
+tf.flags.DEFINE_string("output_results_path", None,
+                       "Path to save pickled results to.")
+tf.flags.DEFINE_bool("use_eos", True,
+                     "Whether to use the eos token during the encoder unroll.")
+tf.flags.DEFINE_integer("unroll_length", None,
+                        "Number of time steps to unroll each decoder for.")
+tf.flags.DEFINE_string("decoder_type", None,
+                       "How to pool the unrolled hidden states: 'mean' or 'concat'.")
+
+if not FLAGS.model_dir:
+    raise ValueError("--model_dir is required.")
+if not FLAGS.output_results_path:
+    raise ValueError("--output_results_path is required.")
+if not FLAGS.unroll_length:
+    raise ValueError("--unroll_length is required.")
+
+decoder_types = ['mean', 'concat']
+if FLAGS.decoder_type not in decoder_types:
+    raise ValueError("--decoder_type must be one of {t}".format(t=decoder_types))
+
+# Set paths to the model.
+VOCAB_FILE = os.path.join(FLAGS.model_dir, "vocab.txt")
+EMBEDDING_MATRIX_FILE = os.path.join(FLAGS.model_dir, "embeddings.npy")
+CHECKPOINT_PATH = FLAGS.model_dir
+FLAGS_PICKLE_PATH = os.path.join(FLAGS.model_dir, "flags.pkl")
+
+# Load the configuration used to make the model
+with open(FLAGS_PICKLE_PATH, 'rb') as f:
+    model_flags = cPickle.load(f)
+
+decoder_config = experiments.get_decoder_config(flags=model_flags)
+model_config = configuration.model_config(
+    input_file_pattern=model_flags.input_file_pattern,
+    vocab_size=model_flags.vocab_size,
+    batch_size=model_flags.batch_size,
+    word_embedding_dim=model_flags.word_dim,
+    encoder_dim=model_flags.encoder_dim,
+    skipgram_encoder=model_flags.skipgram_encoder,
+    sequence_decoder_pre=decoder_config.sequence_decoder_pre,
+    sequence_decoder_cur=decoder_config.sequence_decoder_cur,
+    sequence_decoder_post=decoder_config.sequence_decoder_post,
+    skipgram_decoder_pre=decoder_config.skipgram_decoder_pre,
+    skipgram_decoder_cur=decoder_config.skipgram_decoder_cur,
+    skipgram_decoder_post=decoder_config.skipgram_decoder_post,
+    share_weights_logits=model_flags.share_weights_logits,
+    normalise_decoder_losses=model_flags.normalise_decoder_losses,
+    skipgram_prefactor=model_flags.skipgram_prefactor,
+    sequence_prefactor=model_flags.sequence_prefactor)
+
+# Set up the encoder. Here we are using a single unidirectional model.
+# To use a bidirectional model as well, call load_model() again with
+# configuration.model_config(bidirectional_encoder=True) and paths to the
+# bidirectional model's files. The encoder will use the concatenation of
+# all loaded models.
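+# Loading in 'encode-decode' mode (below) also builds the decoder ops, so the
+# pre/post decoder tensors can later be fetched by name for unrolling.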
+encoder = encoder_manager.EncoderManager()
+encoder.load_model(model_config=model_config,
+                   vocabulary_file=VOCAB_FILE,
+                   embedding_matrix_file=EMBEDDING_MATRIX_FILE,
+                   checkpoint_path=CHECKPOINT_PATH,
+                   mode='encode-decode')
+
+# Build the decoder
+g = encoder.graph
+sess = encoder.sessions[0]
+
+tensor_names_global = {
+    'word_embedding': 'word_embedding:0'}
+
+tensor_names_pre = {
+    'logits': 'logits/logits/decoder_pre:0',
+    'decoder_output': 'decoder_pre/decoder_output:0',
+    'decoder_state': 'decoder_pre/decoder_state:0'}
+
+tensor_names_post = {
+    'logits': 'logits_1/logits/decoder_post:0',
+    'decoder_output': 'decoder_post/decoder_output:0',
+    'decoder_state': 'decoder_post/decoder_state:0'}
+
+decoder_pre = decode.Decoder(
+    g=g,
+    tensor_names_decoder=tensor_names_pre,
+    tensor_names_global=tensor_names_global)
+
+decoder_post = decode.Decoder(
+    g=g,
+    tensor_names_decoder=tensor_names_post,
+    tensor_names_global=tensor_names_global)
+
+
+# encodings = encoder.encode(data)
+
+
+# Set PATHs
+current_path = os.path.dirname(__file__)
+PATH_TO_SENTEVAL = os.path.join(current_path, '../')
+PATH_TO_DATA = os.path.join(current_path, '../data/senteval_data')
+
+# import SentEval
+sys.path.insert(0, PATH_TO_SENTEVAL)
+import senteval
+
+
+# consider the option of lower-casing or not for bow.
+def prepare(params, samples):
+    params.batch_size = 32
+    # set to 10 to be comparable to published results
+    params.kfold = 10
+    return
+
+
+def batcher_steps(steps, decoder_type):
+    def batcher(params, batch):
+        batch = [" ".join(sent) if sent != [] else " ".join(['.'])
+                 for sent in batch]
+
+        decode_pre_rep, decode_post_rep = decode.decode(
+            sess=sess, data=batch,
+            encoder=encoder,
+            decoder_pre=decoder_pre,
+            decoder_post=decoder_post,
+            steps=steps,
+            use_eos=FLAGS.use_eos)
+
+        decode_rep_concat = np.concatenate(
+            (np.array(decode_pre_rep), np.array(decode_post_rep)), axis=1)
+
+        if decoder_type == 'mean':
+            return np.mean(decode_rep_concat, axis=1)
+
+        this_batch_size = len(decode_rep_concat)
+        return np.reshape(decode_rep_concat, (this_batch_size, -1))
+
+    return batcher
+
+
+# Set params for SentEval
+params_senteval = {'task_path': PATH_TO_DATA, 'usepytorch': False, 'kfold': 10}
+params_senteval = dotdict(params_senteval)
+
+# Set up logger
+logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.DEBUG)
+
+transfer_tasks = ['CR', 'MR', 'MPQA', 'SUBJ', 'SST', 'TREC', 'MRPC',
+                  'SICKRelatedness', 'SICKEntailment', 'STSBenchmark']
+
+similarity_tasks = ['STS12', 'STS13', 'STS14', 'STS15', 'STS16']
+
+if __name__ == "__main__":
+    batcher = batcher_steps(steps=FLAGS.unroll_length,
+                            decoder_type=FLAGS.decoder_type)
+
+    se = senteval.SentEval(params_senteval, batcher, prepare)
+    tasks = transfer_tasks
+    results = se.eval(tasks)
+    f = open(FLAGS.output_results_path, 'wb')
+    cPickle.dump(results, f)
+    f.close()
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..9d9acc8
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,16 @@
+from setuptools import setup, find_packages
+
+with open('README.md') as f:
+    readme = f.read()
+
+setup(
+    name='decoding_decoders',
+    version='0.0.1',
+    description='Decoding Decoders: Finding Optimal Representation Spaces'
+                ' for Unsupervised Similarity Tasks',
+    long_description=readme,
+    author='Babylon AI Research',
+    author_email='nils.hammerla@babylonhealth.com',
+    url='https://github.com/Babylonpartners/decoding-decoders',
+    packages=find_packages()
+)
diff --git a/skip_thoughts/BUILD b/skip_thoughts/BUILD
new file mode 100644
index
0000000..40ecd50 --- /dev/null +++ b/skip_thoughts/BUILD @@ -0,0 +1,94 @@ +package(default_visibility = [":internal"]) + +licenses(["notice"]) # Apache 2.0 + +exports_files(["LICENSE"]) + +package_group( + name = "internal", + packages = [ + "//skip_thoughts/...", + ], +) + +py_library( + name = "configuration", + srcs = ["configuration.py"], + srcs_version = "PY2AND3", +) + +py_library( + name = "skip_thoughts_model", + srcs = ["skip_thoughts_model.py"], + srcs_version = "PY2AND3", + deps = [ + "//skip_thoughts/ops:gru_cell", + "//skip_thoughts/ops:input_ops", + ], +) + +py_test( + name = "skip_thoughts_model_test", + size = "large", + srcs = ["skip_thoughts_model_test.py"], + deps = [ + ":configuration", + ":skip_thoughts_model", + ], +) + +py_binary( + name = "train", + srcs = ["train.py"], + srcs_version = "PY2AND3", + deps = [ + ":configuration", + ":skip_thoughts_model", + ], +) + +py_binary( + name = "track_perplexity", + srcs = ["track_perplexity.py"], + srcs_version = "PY2AND3", + deps = [ + ":configuration", + ":skip_thoughts_model", + ], +) + +py_binary( + name = "vocabulary_expansion", + srcs = ["vocabulary_expansion.py"], + srcs_version = "PY2AND3", +) + +py_library( + name = "skip_thoughts_encoder", + srcs = ["skip_thoughts_encoder.py"], + srcs_version = "PY2AND3", + deps = [ + ":skip_thoughts_model", + "//skip_thoughts/data:special_words", + ], +) + +py_library( + name = "encoder_manager", + srcs = ["encoder_manager.py"], + srcs_version = "PY2AND3", + deps = [ + ":skip_thoughts_encoder", + ], +) + +py_binary( + name = "evaluate", + srcs = ["evaluate.py"], + srcs_version = "PY2AND3", + deps = [ + ":encoder_manager", + "//skip_thoughts:configuration", + ], +) + diff --git a/skip_thoughts/README.md b/skip_thoughts/README.md new file mode 100644 index 0000000..96f4e85 --- /dev/null +++ b/skip_thoughts/README.md @@ -0,0 +1,475 @@ +# Skip-Thought Vectors + +This is a TensorFlow implementation of the model described in: + +Jamie Ryan Kiros, Yukun Zhu, Ruslan Salakhutdinov, Richard S. Zemel, +Antonio Torralba, Raquel Urtasun, Sanja Fidler. +[Skip-Thought Vectors](https://papers.nips.cc/paper/5950-skip-thought-vectors.pdf). +*In NIPS, 2015.* + + +## Contact +***Code author:*** Chris Shallue + +***Pull requests and issues:*** @cshallue + +## Contents +* [Model Overview](#model-overview) +* [Getting Started](#getting-started) + * [Install Required Packages](#install-required-packages) + * [Download Pretrained Models (Optional)](#download-pretrained-models-optional) +* [Training a Model](#training-a-model) + * [Prepare the Training Data](#prepare-the-training-data) + * [Run the Training Script](#run-the-training-script) + * [Track Training Progress](#track-training-progress) +* [Expanding the Vocabulary](#expanding-the-vocabulary) + * [Overview](#overview) + * [Preparation](#preparation) + * [Run the Vocabulary Expansion Script](#run-the-vocabulary-expansion-script) +* [Evaluating a Model](#evaluating-a-model) + * [Overview](#overview-1) + * [Preparation](#preparation-1) + * [Run the Evaluation Tasks](#run-the-evaluation-tasks) +* [Encoding Sentences](#encoding-sentences) + +## Model overview + +The *Skip-Thoughts* model is a sentence encoder. It learns to encode input +sentences into a fixed-dimensional vector representation that is useful for many +tasks, for example to detect paraphrases or to classify whether a product review +is positive or negative. 
See the +[Skip-Thought Vectors](https://papers.nips.cc/paper/5950-skip-thought-vectors.pdf) +paper for details of the model architecture and more example applications. + +A trained *Skip-Thoughts* model will encode similar sentences nearby each other +in the embedding vector space. The following examples show the nearest neighbor by +cosine similarity of some sentences from the +[movie review dataset](https://www.cs.cornell.edu/people/pabo/movie-review-data/). + + +| Input sentence | Nearest Neighbor | +|----------------|------------------| +| Simplistic, silly and tedious. | Trite, banal, cliched, mostly inoffensive. | +| Not so much farcical as sour. | Not only unfunny, but downright repellent. | +| A sensitive and astute first feature by Anne-Sophie Birot. | Absorbing character study by André Turpin . | +| An enthralling, entertaining feature. | A slick, engrossing melodrama. | + +## Getting Started + +### Install Required Packages +First ensure that you have installed the following required packages: + +* **Bazel** ([instructions](http://bazel.build/docs/install.html)) +* **TensorFlow** ([instructions](https://www.tensorflow.org/install/)) +* **NumPy** ([instructions](http://www.scipy.org/install.html)) +* **scikit-learn** ([instructions](http://scikit-learn.org/stable/install.html)) +* **Natural Language Toolkit (NLTK)** + * First install NLTK ([instructions](http://www.nltk.org/install.html)) + * Then install the NLTK data ([instructions](http://www.nltk.org/data.html)) +* **gensim** ([instructions](https://radimrehurek.com/gensim/install.html)) + * Only required if you will be expanding your vocabulary with the [word2vec](https://code.google.com/archive/p/word2vec/) model. + + +### Download Pretrained Models (Optional) + +You can download model checkpoints pretrained on the +[BookCorpus](http://yknzhu.wixsite.com/mbweb) dataset in the following +configurations: + +* Unidirectional RNN encoder ("uni-skip" in the paper) +* Bidirectional RNN encoder ("bi-skip" in the paper) + +```shell +# Directory to download the pretrained models to. +PRETRAINED_MODELS_DIR="${HOME}/skip_thoughts/pretrained/" + +mkdir -p ${PRETRAINED_MODELS_DIR} +cd ${PRETRAINED_MODELS_DIR} + +# Download and extract the unidirectional model. +wget "http://download.tensorflow.org/models/skip_thoughts_uni_2017_02_02.tar.gz" +tar -xvf skip_thoughts_uni_2017_02_02.tar.gz +rm skip_thoughts_uni_2017_02_02.tar.gz + +# Download and extract the bidirectional model. +wget "http://download.tensorflow.org/models/skip_thoughts_bi_2017_02_16.tar.gz" +tar -xvf skip_thoughts_bi_2017_02_16.tar.gz +rm skip_thoughts_bi_2017_02_16.tar.gz +``` + +You can now skip to the sections [Evaluating a Model](#evaluating-a-model) and +[Encoding Sentences](#encoding-sentences). + + +## Training a Model + +### Prepare the Training Data + +To train a model you will need to provide training data in TFRecord format. The +TFRecord format consists of a set of sharded files containing serialized +`tf.Example` protocol buffers. Each `tf.Example` proto contains three +sentences: + + * `encode`: The sentence to encode. + * `decode_pre`: The sentence preceding `encode` in the original text. + * `decode_post`: The sentence following `encode` in the original text. + +Each sentence is a list of words. During preprocessing, a dictionary is created +that assigns each word in the vocabulary to an integer-valued id. Each sentence +is encoded as a list of integer word ids in the `tf.Example` protos. 
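+
+For illustration only, a single serialized example might be built along these
+lines (a hedged sketch with made-up word ids; the real preprocessing script may
+differ in detail):
+
+```python
+import tensorflow as tf
+
+def _int64_feature(ids):
+    # Wrap a list of integer word ids as an int64 feature list.
+    return tf.train.Feature(int64_list=tf.train.Int64List(value=ids))
+
+example = tf.train.Example(features=tf.train.Features(feature={
+    "decode_pre": _int64_feature([4, 18, 7]),   # sentence before `encode`
+    "encode": _int64_feature([4, 25, 9, 2]),    # the sentence to encode
+    "decode_post": _int64_feature([6, 31, 8]),  # sentence after `encode`
+}))
+serialized = example.SerializeToString()
+```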
+
+We have provided a script to preprocess any set of text files into this format.
+You may wish to use the [BookCorpus](http://yknzhu.wixsite.com/mbweb) dataset.
+Note that the preprocessing script may take **12 hours** or more to complete
+on this large dataset.
+
+```shell
+# Comma-separated list of globs matching the input files. The format of
+# the input files is assumed to be a list of newline-separated sentences, where
+# each sentence is already tokenized.
+INPUT_FILES="${HOME}/skip_thoughts/bookcorpus/*.txt"
+
+# Location to save the preprocessed training and validation data.
+DATA_DIR="${HOME}/skip_thoughts/data"
+
+# Build the preprocessing script.
+cd tensorflow-models/skip_thoughts
+bazel build -c opt //skip_thoughts/data:preprocess_dataset
+
+# Run the preprocessing script.
+bazel-bin/skip_thoughts/data/preprocess_dataset \
+  --input_files=${INPUT_FILES} \
+  --output_dir=${DATA_DIR}
+```
+
+When the script finishes you will find 100 training files and 1 validation file
+in `DATA_DIR`. The files will match the patterns `train-?????-of-00100` and
+`validation-00000-of-00001` respectively.
+
+The script will also produce a file named `vocab.txt`. The format of this file
+is a list of newline-separated words where the word id is the corresponding
+0-based line index. Words are sorted by descending order of frequency in the
+input data. Only the top 20,000 words are assigned unique ids; all other words
+are assigned the "unknown id" of 1 in the processed data.
+
+### Run the Training Script
+
+Execute the following commands to start the training script. By default it will
+run for 500k steps (around 9 days on a GeForce GTX 1080 GPU).
+
+```shell
+# Directory containing the preprocessed data.
+DATA_DIR="${HOME}/skip_thoughts/data"
+
+# Directory to save the model.
+MODEL_DIR="${HOME}/skip_thoughts/model"
+
+# Build the model.
+cd tensorflow-models/skip_thoughts
+bazel build -c opt //skip_thoughts/...
+
+# Run the training script.
+bazel-bin/skip_thoughts/train \
+  --input_file_pattern="${DATA_DIR}/train-?????-of-00100" \
+  --train_dir="${MODEL_DIR}/train"
+```
+
+### Track Training Progress
+
+Optionally, you can run the `track_perplexity` script in a separate process.
+This will log per-word perplexity on the validation set, which allows training
+progress to be monitored on
+[TensorBoard](https://www.tensorflow.org/get_started/summaries_and_tensorboard).
+
+Note that you may run out of memory if you run this script on the same GPU
+as the training script. You can set the environment variable
+`CUDA_VISIBLE_DEVICES=""` to force the script to run on CPU. If it runs too
+slowly on CPU, you can decrease the value of `--num_eval_examples`.
+
+```shell
+DATA_DIR="${HOME}/skip_thoughts/data"
+MODEL_DIR="${HOME}/skip_thoughts/model"
+
+# Ignore GPU devices (only necessary if your GPU is currently memory
+# constrained, for example, by running the training script).
+export CUDA_VISIBLE_DEVICES=""
+
+# Run the evaluation script. This will run in a loop, periodically loading the
+# latest model checkpoint file and computing evaluation metrics.
+bazel-bin/skip_thoughts/track_perplexity \
+  --input_file_pattern="${DATA_DIR}/validation-?????-of-00001" \
+  --checkpoint_dir="${MODEL_DIR}/train" \
+  --eval_dir="${MODEL_DIR}/val" \
+  --num_eval_examples=50000
+```
+
+If you started the `track_perplexity` script, run a
+[TensorBoard](https://www.tensorflow.org/get_started/summaries_and_tensorboard)
+server in a separate process for real-time monitoring of training summaries and
+validation perplexity.
+
+```shell
+MODEL_DIR="${HOME}/skip_thoughts/model"
+
+# Run a TensorBoard server.
+tensorboard --logdir="${MODEL_DIR}"
+```
+
+## Expanding the Vocabulary
+
+### Overview
+
+The vocabulary generated by the preprocessing script contains only 20,000 words,
+which is insufficient for many tasks. For example, a sentence from Wikipedia
+might contain nouns that do not appear in this vocabulary.
+
+A solution to this problem, described in the
+[Skip-Thought Vectors](https://papers.nips.cc/paper/5950-skip-thought-vectors.pdf)
+paper, is to learn a mapping that transfers word representations from one model
+to another. This idea is based on the "Translation Matrix" method from the paper
+[Exploiting Similarities Among Languages for Machine Translation](https://arxiv.org/abs/1309.4168).
+
+
+Specifically, we will load the word embeddings from a trained *Skip-Thoughts*
+model and from a trained [word2vec model](https://arxiv.org/pdf/1301.3781.pdf)
+(which has a much larger vocabulary). We will train a linear regression model
+without regularization to learn a linear mapping from the word2vec embedding
+space to the *Skip-Thoughts* embedding space. We will then apply the linear
+model to all words in the word2vec vocabulary, yielding vectors in the
+*Skip-Thoughts* word embedding space for the union of the two vocabularies.
+
+The linear regression task is to learn a parameter matrix *W* to minimize
+*||X - Y \* W||²*, where *X* is a matrix of *Skip-Thoughts* embeddings of
+shape `[num_words, dim1]`, *Y* is a matrix of word2vec embeddings of shape
+`[num_words, dim2]`, and *W* is a matrix of shape `[dim2, dim1]`.
+
+### Preparation
+
+First you will need to download and unpack a pretrained
+[word2vec model](https://arxiv.org/pdf/1301.3781.pdf) from
+[this website](https://code.google.com/archive/p/word2vec/)
+([direct download link](https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit?usp=sharing)).
+This model was trained on the Google News dataset (about 100 billion words).
+
+
+Also ensure that you have already [installed gensim](https://radimrehurek.com/gensim/install.html).
+
+### Run the Vocabulary Expansion Script
+
+```shell
+# Path to checkpoint file or a directory containing checkpoint files (the script
+# will select the most recent).
+CHECKPOINT_PATH="${HOME}/skip_thoughts/model/train"
+
+# Vocabulary file generated by the preprocessing script.
+SKIP_THOUGHTS_VOCAB="${HOME}/skip_thoughts/data/vocab.txt"
+
+# Path to downloaded word2vec model.
+WORD2VEC_MODEL="${HOME}/skip_thoughts/googlenews/GoogleNews-vectors-negative300.bin"
+
+# Output directory.
+EXP_VOCAB_DIR="${HOME}/skip_thoughts/exp_vocab"
+
+# Build the vocabulary expansion script.
+cd tensorflow-models/skip_thoughts
+bazel build -c opt //skip_thoughts:vocabulary_expansion
+
+# Run the vocabulary expansion script.
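+# (Conceptually, this fits the linear map W described above by ordinary least
+# squares, and writes the expanded vocab.txt and embeddings.npy to --output_dir.)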
+bazel-bin/skip_thoughts/vocabulary_expansion \ + --skip_thoughts_model=${CHECKPOINT_PATH} \ + --skip_thoughts_vocab=${SKIP_THOUGHTS_VOCAB} \ + --word2vec_model=${WORD2VEC_MODEL} \ + --output_dir=${EXP_VOCAB_DIR} +``` + +## Evaluating a Model + +### Overview + +The model can be evaluated using the benchmark tasks described in the +[Skip-Thought Vectors](https://papers.nips.cc/paper/5950-skip-thought-vectors.pdf) +paper. The following tasks are supported (refer to the paper for full details): + + * **SICK** semantic relatedness task. + * **MSRP** (Microsoft Research Paraphrase Corpus) paraphrase detection task. + * Binary classification tasks: + * **MR** movie review sentiment task. + * **CR** customer product review task. + * **SUBJ** subjectivity/objectivity task. + * **MPQA** opinion polarity task. + * **TREC** question-type classification task. + +### Preparation + +You will need to clone or download the +[skip-thoughts GitHub repository](https://github.com/ryankiros/skip-thoughts) by +[ryankiros](https://github.com/ryankiros) (the first author of the Skip-Thoughts +paper): + +```shell +# Folder to clone the repository to. +ST_KIROS_DIR="${HOME}/skip_thoughts/skipthoughts_kiros" + +# Clone the repository. +git clone git@github.com:ryankiros/skip-thoughts.git "${ST_KIROS_DIR}/skipthoughts" + +# Make the package importable. +export PYTHONPATH="${ST_KIROS_DIR}/:${PYTHONPATH}" +``` + +You will also need to download the data needed for each evaluation task. See the +instructions [here](https://github.com/ryankiros/skip-thoughts). + +For example, the CR (customer review) dataset is found [here](http://nlp.stanford.edu/~sidaw/home/projects:nbsvm). For this task we want the +files `custrev.pos` and `custrev.neg`. + +### Run the Evaluation Tasks + +In the following example we will evaluate a unidirectional model ("uni-skip" in +the paper) on the CR task. To use a bidirectional model ("bi-skip" in the +paper), simply pass the flags `--bi_vocab_file`, `--bi_embeddings_file` and +`--bi_checkpoint_path` instead. To use the "combine-skip" model described in the +paper you will need to pass both the unidirectional and bidirectional flags. + +```shell +# Path to checkpoint file or a directory containing checkpoint files (the script +# will select the most recent). +CHECKPOINT_PATH="${HOME}/skip_thoughts/model/train" + +# Vocabulary file generated by the vocabulary expansion script. +VOCAB_FILE="${HOME}/skip_thoughts/exp_vocab/vocab.txt" + +# Embeddings file generated by the vocabulary expansion script. +EMBEDDINGS_FILE="${HOME}/skip_thoughts/exp_vocab/embeddings.npy" + +# Directory containing files custrev.pos and custrev.neg. +EVAL_DATA_DIR="${HOME}/skip_thoughts/eval_data" + +# Build the evaluation script. +cd tensorflow-models/skip_thoughts +bazel build -c opt //skip_thoughts:evaluate + +# Run the evaluation script. +bazel-bin/skip_thoughts/evaluate \ + --eval_task=CR \ + --data_dir=${EVAL_DATA_DIR} \ + --uni_vocab_file=${VOCAB_FILE} \ + --uni_embeddings_file=${EMBEDDINGS_FILE} \ + --uni_checkpoint_path=${CHECKPOINT_PATH} +``` + +Output: + +```python +[0.82539682539682535, 0.84084880636604775, 0.83023872679045096, + 0.86206896551724133, 0.83554376657824936, 0.85676392572944293, + 0.84084880636604775, 0.83023872679045096, 0.85145888594164454, + 0.82758620689655171] +``` + +The output is a list of accuracies of 10 cross-validation classification models. +To get a single number, simply take the average: + +```python +ipython # Launch iPython. 
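+# (Paste in the list of accuracies printed by the evaluation run above.)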
+ +In [0]: +import numpy as np +np.mean([0.82539682539682535, 0.84084880636604775, 0.83023872679045096, + 0.86206896551724133, 0.83554376657824936, 0.85676392572944293, + 0.84084880636604775, 0.83023872679045096, 0.85145888594164454, + 0.82758620689655171]) + +Out [0]: 0.84009936423729525 +``` + +## Encoding Sentences + +In this example we will encode data from the +[movie review dataset](https://www.cs.cornell.edu/people/pabo/movie-review-data/) +(specifically the [sentence polarity dataset v1.0](https://www.cs.cornell.edu/people/pabo/movie-review-data/rt-polaritydata.tar.gz)). + +```python +ipython # Launch iPython. + +In [0]: + +# Imports. +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +import numpy as np +import os.path +import scipy.spatial.distance as sd +from skip_thoughts import configuration +from skip_thoughts import encoder_manager + +In [1]: +# Set paths to the model. +VOCAB_FILE = "/path/to/vocab.txt" +EMBEDDING_MATRIX_FILE = "/path/to/embeddings.npy" +CHECKPOINT_PATH = "/path/to/model.ckpt-9999" +# The following directory should contain files rt-polarity.neg and +# rt-polarity.pos. +MR_DATA_DIR = "/dir/containing/mr/data" + +In [2]: +# Set up the encoder. Here we are using a single unidirectional model. +# To use a bidirectional model as well, call load_model() again with +# configuration.model_config(bidirectional_encoder=True) and paths to the +# bidirectional model's files. The encoder will use the concatenation of +# all loaded models. +encoder = encoder_manager.EncoderManager() +encoder.load_model(configuration.model_config(), + vocabulary_file=VOCAB_FILE, + embedding_matrix_file=EMBEDDING_MATRIX_FILE, + checkpoint_path=CHECKPOINT_PATH) + +In [3]: +# Load the movie review dataset. +data = [] +with open(os.path.join(MR_DATA_DIR, 'rt-polarity.neg'), 'rb') as f: + data.extend([line.decode('latin-1').strip() for line in f]) +with open(os.path.join(MR_DATA_DIR, 'rt-polarity.pos'), 'rb') as f: + data.extend([line.decode('latin-1').strip() for line in f]) + +In [4]: +# Generate Skip-Thought Vectors for each sentence in the dataset. +encodings = encoder.encode(data) + +In [5]: +# Define a helper function to generate nearest neighbors. +def get_nn(ind, num=10): + encoding = encodings[ind] + scores = sd.cdist([encoding], encodings, "cosine")[0] + sorted_ids = np.argsort(scores) + print("Sentence:") + print("", data[ind]) + print("\nNearest neighbors:") + for i in range(1, num + 1): + print(" %d. %s (%.3f)" % + (i, data[sorted_ids[i]], scores[sorted_ids[i]])) + +In [6]: +# Compute nearest neighbors of the first sentence in the dataset. +get_nn(0) +``` + +Output: + +``` +Sentence: + simplistic , silly and tedious . + +Nearest neighbors: + 1. trite , banal , cliched , mostly inoffensive . (0.247) + 2. banal and predictable . (0.253) + 3. witless , pointless , tasteless and idiotic . (0.272) + 4. loud , silly , stupid and pointless . (0.295) + 5. grating and tedious . (0.299) + 6. idiotic and ugly . (0.330) + 7. black-and-white and unrealistic . (0.335) + 8. hopelessly inane , humorless and under-inspired . (0.335) + 9. shallow , noisy and pretentious . (0.340) + 10. . . . unlikable , uninteresting , unfunny , and completely , utterly inept . 
(0.346) +``` \ No newline at end of file diff --git a/skip_thoughts/__init__.py b/skip_thoughts/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/skip_thoughts/configuration.py b/skip_thoughts/configuration.py new file mode 100644 index 0000000..db76406 --- /dev/null +++ b/skip_thoughts/configuration.py @@ -0,0 +1,146 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +# Changes by Babylon Partners +# - Added decoder configs +# - Added pretrained embeddings configs +# ============================================================================== +"""Default configuration for model architecture and training.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + + +class _HParams(object): + """Wrapper for configuration parameters.""" + pass + + +def model_config(input_file_pattern=None, + input_queue_capacity=640000, + num_input_reader_threads=1, + shuffle_input_data=True, + uniform_init_scale=0.1, + vocab_size=20000, + batch_size=128, + word_embedding_dim=620, + pretrained_word_emb_file=None, + word_emb_trainable=False, + bidirectional_encoder=False, + encoder_dim=2400, + skipgram_encoder=False, + sequence_decoder_pre=True, + sequence_decoder_cur=False, + sequence_decoder_post=True, + skipgram_decoder_pre=True, + skipgram_decoder_cur=False, + skipgram_decoder_post=True, + share_weights_logits=True, + normalise_decoder_losses=False, + skipgram_prefactor=1., + sequence_prefactor=1.): + """Creates a model configuration object. + + Args: + input_file_pattern: File pattern of sharded TFRecord files containing + tf.Example protobufs. + input_queue_capacity: Number of examples to keep in the input queue. + num_input_reader_threads: Number of threads for prefetching input + tf.Examples. + shuffle_input_data: Whether to shuffle the input data. + uniform_init_scale: Scale of random uniform initializer. + vocab_size: Number of unique words in the vocab. + batch_size: Batch size (training and evaluation only). + word_embedding_dim: Word embedding dimension. + bidirectional_encoder: Whether to use a bidirectional or unidirectional + encoder RNN. + encoder_dim: Number of output dimensions of the sentence encoder. + + Returns: + An object containing model configuration parameters. 
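+
+  Note: pretrained_word_emb_file, word_emb_trainable, skipgram_encoder, the
+    sequence_decoder_* / skipgram_decoder_* switches, share_weights_logits,
+    normalise_decoder_losses, skipgram_prefactor and sequence_prefactor are
+    Babylon Partners additions; see the change notes at the top of this file.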
+ """ + config = _HParams() + config.input_file_pattern = input_file_pattern + config.input_queue_capacity = input_queue_capacity + config.num_input_reader_threads = num_input_reader_threads + config.shuffle_input_data = shuffle_input_data + config.uniform_init_scale = uniform_init_scale + config.vocab_size = vocab_size + config.batch_size = batch_size + config.word_embedding_dim = word_embedding_dim + config.pretrained_word_emb_file = pretrained_word_emb_file + config.word_emb_trainable = word_emb_trainable + config.bidirectional_encoder = bidirectional_encoder + config.encoder_dim = encoder_dim + config.skipgram_encoder = skipgram_encoder + config.sequence_decoder_pre = sequence_decoder_pre + config.sequence_decoder_cur = sequence_decoder_cur + config.sequence_decoder_post = sequence_decoder_post + config.skipgram_decoder_pre = skipgram_decoder_pre + config.skipgram_decoder_cur = skipgram_decoder_cur + config.skipgram_decoder_post = skipgram_decoder_post + config.share_weights_logits = share_weights_logits + config.normalise_decoder_losses = normalise_decoder_losses + config.skipgram_prefactor = skipgram_prefactor + config.sequence_prefactor = sequence_prefactor + config.num_skipgram_decoders = sum((config.skipgram_decoder_pre, + config.skipgram_decoder_cur, + config.skipgram_decoder_post)) + config.num_sequence_decoders = sum((config.sequence_decoder_pre, + config.sequence_decoder_cur, + config.sequence_decoder_post)) + return config + + +def training_config(learning_rate=0.0008, + learning_rate_decay_factor=0.5, + learning_rate_decay_steps=400000, + number_of_steps=500000, + clip_gradient_norm=5.0, + save_model_secs=600, + save_summaries_secs=600): + """Creates a training configuration object. + + Args: + learning_rate: Initial learning rate. + learning_rate_decay_factor: If > 0, the learning rate decay factor. + learning_rate_decay_steps: The number of steps before the learning rate + decays by learning_rate_decay_factor. + number_of_steps: The total number of training steps to run. Passing None + will cause the training script to run indefinitely. + clip_gradient_norm: If not None, then clip gradients to this value. + save_model_secs: How often (in seconds) to save model checkpoints. + save_summaries_secs: How often (in seconds) to save model summaries. + + Returns: + An object containing training configuration parameters. + + Raises: + ValueError: If learning_rate_decay_factor is set and + learning_rate_decay_steps is unset. 
+ """ + if learning_rate_decay_factor and not learning_rate_decay_steps: + raise ValueError( + "learning_rate_decay_factor requires learning_rate_decay_steps.") + + config = _HParams() + config.learning_rate = learning_rate + config.learning_rate_decay_factor = learning_rate_decay_factor + config.learning_rate_decay_steps = learning_rate_decay_steps + config.number_of_steps = number_of_steps + config.clip_gradient_norm = clip_gradient_norm + config.save_model_secs = save_model_secs + config.save_summaries_secs = save_summaries_secs + return config diff --git a/skip_thoughts/data/BUILD b/skip_thoughts/data/BUILD new file mode 100644 index 0000000..afc209e --- /dev/null +++ b/skip_thoughts/data/BUILD @@ -0,0 +1,23 @@ +package(default_visibility = ["//skip_thoughts:internal"]) + +licenses(["notice"]) # Apache 2.0 + +exports_files(["LICENSE"]) + +py_library( + name = "special_words", + srcs = ["special_words.py"], + srcs_version = "PY2AND3", + deps = [], +) + +py_binary( + name = "preprocess_dataset", + srcs = [ + "preprocess_dataset.py", + ], + srcs_version = "PY2AND3", + deps = [ + ":special_words", + ], +) diff --git a/skip_thoughts/data/__init__.py b/skip_thoughts/data/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/skip_thoughts/data/preprocess_dataset.py b/skip_thoughts/data/preprocess_dataset.py new file mode 100644 index 0000000..5764306 --- /dev/null +++ b/skip_thoughts/data/preprocess_dataset.py @@ -0,0 +1,308 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +# Changes by Babylon Partners +# - Added _string_to_words +# ============================================================================== +"""Converts a set of text files to TFRecord format with Example protos. + +Each Example proto in the output contains the following fields: + + decode_pre: list of int64 ids corresponding to the "previous" sentence. + encode: list of int64 ids corresponding to the "current" sentence. + decode_post: list of int64 ids corresponding to the "post" sentence. + +In addition, the following files are generated: + + vocab.txt: List of " " pairs, where is the integer + encoding of in the Example protos. + word_counts.txt: List of " " pairs, where is the number + of occurrences of in the input files. + +The vocabulary of word ids is constructed from the top --num_words by word +count. All other words get the word id. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections +import os + + +import numpy as np +import tensorflow as tf + +from skip_thoughts.data import special_words + +FLAGS = tf.flags.FLAGS + +tf.flags.DEFINE_string("input_files", None, + "Comma-separated list of globs matching the input " + "files. 
+                       "a list of newline-separated sentences, where each "
+                       "sentence is already tokenized.")
+
+tf.flags.DEFINE_string("vocab_file", "",
+                       "(Optional) existing vocab file. Otherwise, a new vocab "
+                       "file is created and written to the output directory. "
+                       "The file format is a list of newline-separated words, "
+                       "where the word id is the corresponding 0-based index "
+                       "in the file.")
+
+tf.flags.DEFINE_string("output_dir", None, "Output directory.")
+
+tf.flags.DEFINE_integer("train_output_shards", 100,
+                        "Number of output shards for the training set.")
+
+tf.flags.DEFINE_integer("validation_output_shards", 1,
+                        "Number of output shards for the validation set.")
+
+tf.flags.DEFINE_integer("num_validation_sentences", 50000,
+                        "Number of sentences to reserve for the validation "
+                        "set.")
+
+tf.flags.DEFINE_integer("num_words", 20000,
+                        "Number of words to include in the output.")
+
+tf.flags.DEFINE_integer("max_sentences", 0,
+                        "If > 0, the maximum number of sentences to output.")
+
+tf.flags.DEFINE_integer("max_sentence_length", 30,
+                        "If > 0, exclude sentences whose encode, decode_pre OR "
+                        "decode_post sentence exceeds this length.")
+
+tf.flags.DEFINE_boolean("add_eos", True,
+                        "Whether to add end-of-sentence ids to the output.")
+
+tf.logging.set_verbosity(tf.logging.INFO)
+
+
+def _string_to_words(sentence):
+  return [w.strip() for w in sentence.split()]
+
+
+def _build_vocabulary(input_files):
+  """Loads or builds the model vocabulary.
+
+  Args:
+    input_files: List of pre-tokenized input .txt files.
+
+  Returns:
+    vocab: A dictionary of word to id.
+  """
+  if FLAGS.vocab_file:
+    tf.logging.info("Loading existing vocab file.")
+    vocab = collections.OrderedDict()
+    with tf.gfile.GFile(FLAGS.vocab_file, mode="r") as f:
+      for i, line in enumerate(f):
+        word = line.decode("utf-8").strip()
+        assert word not in vocab, "Attempting to add word twice: %s" % word
+        vocab[word] = i
+    tf.logging.info("Read vocab of size %d from %s",
+                    len(vocab), FLAGS.vocab_file)
+    return vocab
+
+  tf.logging.info("Creating vocabulary.")
+  num = 0
+  wordcount = collections.Counter()
+  for input_file in input_files:
+    tf.logging.info("Processing file: %s", input_file)
+    for sentence in tf.gfile.FastGFile(input_file):
+      wordcount.update(_string_to_words(sentence))
+
+      num += 1
+      if num % 1000000 == 0:
+        tf.logging.info("Processed %d sentences", num)
+
+  tf.logging.info("Processed %d sentences total", num)
+
+  # Wrap in list() so the words and counts can be indexed below (the views
+  # returned by keys()/values() are not indexable in Python 3).
+  words = list(wordcount.keys())
+  freqs = list(wordcount.values())
+  sorted_indices = np.argsort(freqs)[::-1]
+
+  vocab = collections.OrderedDict()
+  vocab[special_words.EOS] = special_words.EOS_ID
+  vocab[special_words.UNK] = special_words.UNK_ID
+  for w_id, w_index in enumerate(sorted_indices[0:FLAGS.num_words - 2]):
+    vocab[words[w_index]] = w_id + 2  # 0: EOS, 1: UNK.
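+  # With the two reserved ids, only the top (num_words - 2) corpus words
+  # receive their own ids; every other word maps to UNK_ID at encoding time.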
+
+  tf.logging.info("Created vocab with %d words", len(vocab))
+
+  vocab_file = os.path.join(FLAGS.output_dir, "vocab.txt")
+  with tf.gfile.FastGFile(vocab_file, "w") as f:
+    f.write("\n".join(vocab.keys()))
+  tf.logging.info("Wrote vocab file to %s", vocab_file)
+
+  word_counts_file = os.path.join(FLAGS.output_dir, "word_counts.txt")
+  with tf.gfile.FastGFile(word_counts_file, "w") as f:
+    for i in sorted_indices:
+      f.write("%s %d\n" % (words[i], freqs[i]))
+  tf.logging.info("Wrote word counts file to %s", word_counts_file)
+
+  return vocab
+
+
+def _int64_feature(value):
+  """Helper for creating an Int64 Feature."""
+  return tf.train.Feature(int64_list=tf.train.Int64List(
+      value=[int(v) for v in value]))
+
+
+def _sentence_to_ids(sentence, vocab):
+  """Helper for converting a sentence (list of words) to a list of ids."""
+  ids = [vocab.get(w, special_words.UNK_ID) for w in sentence]
+  if FLAGS.add_eos:
+    ids.append(special_words.EOS_ID)
+  return ids
+
+
+def _create_serialized_example(predecessor, current, successor, vocab):
+  """Helper for creating a serialized Example proto."""
+  example = tf.train.Example(features=tf.train.Features(feature={
+      "decode_pre": _int64_feature(_sentence_to_ids(predecessor, vocab)),
+      "encode": _int64_feature(_sentence_to_ids(current, vocab)),
+      "decode_post": _int64_feature(_sentence_to_ids(successor, vocab)),
+  }))
+
+  return example.SerializeToString()
+
+
+def _process_input_file(filename, vocab, stats):
+  """Processes the sentences in an input file.
+
+  Args:
+    filename: Path to a pre-tokenized input .txt file.
+    vocab: A dictionary of word to id.
+    stats: A Counter object for statistics.
+
+  Returns:
+    processed: A list of serialized Example protos.
+  """
+  tf.logging.info("Processing input file: %s", filename)
+  processed = []
+
+  predecessor = None  # Predecessor sentence (list of words).
+  current = None  # Current sentence (list of words).
+  successor = None  # Successor sentence (list of words).
+
+  for successor_str in tf.gfile.FastGFile(filename):
+    stats.update(["sentences_seen"])
+    successor = _string_to_words(successor_str)
+
+    # The first 2 sentences per file will be skipped.
+    if predecessor and current and successor:
+      stats.update(["sentences_considered"])
+
+      # Note that we are going to insert <eos> later, so we only allow
+      # sentences with strictly less than max_sentence_length to pass.
+      if FLAGS.max_sentence_length and (
+          len(predecessor) >= FLAGS.max_sentence_length or len(current) >=
+          FLAGS.max_sentence_length or len(successor) >=
+          FLAGS.max_sentence_length):
+        stats.update(["sentences_too_long"])
+      else:
+        serialized = _create_serialized_example(predecessor, current,
+                                                successor, vocab)
+        processed.append(serialized)
+        stats.update(["sentences_output"])
+
+    predecessor = current
+    current = successor
+
+    sentences_seen = stats["sentences_seen"]
+    sentences_output = stats["sentences_output"]
+    if sentences_seen and sentences_seen % 100000 == 0:
+      tf.logging.info("Processed %d sentences (%d output)", sentences_seen,
+                      sentences_output)
+    if FLAGS.max_sentences and sentences_output >= FLAGS.max_sentences:
+      break
+
+  tf.logging.info("Completed processing file %s", filename)
+  return processed
+
+
+def _write_shard(filename, dataset, indices):
+  """Writes a TFRecord shard."""
+  with tf.python_io.TFRecordWriter(filename) as writer:
+    for j in indices:
+      writer.write(dataset[j])
+
+
+def _write_dataset(name, dataset, indices, num_shards):
+  """Writes a sharded TFRecord dataset.
+
+  Args:
+    name: Name of the dataset (e.g. "train").
+    dataset: List of serialized Example protos.
+    indices: List of indices of 'dataset' to be written.
+    num_shards: The number of output shards.
+  """
+  tf.logging.info("Writing dataset %s", name)
+  borders = np.int32(np.linspace(0, len(indices), num_shards + 1))
+  for i in range(num_shards):
+    filename = os.path.join(FLAGS.output_dir, "%s-%.5d-of-%.5d" % (name, i,
+                                                                   num_shards))
+    shard_indices = indices[borders[i]:borders[i + 1]]
+    _write_shard(filename, dataset, shard_indices)
+    tf.logging.info("Wrote dataset indices [%d, %d) to output shard %s",
+                    borders[i], borders[i + 1], filename)
+  tf.logging.info("Finished writing %d sentences in dataset %s.",
+                  len(indices), name)
+
+
+def main(unused_argv):
+  if not FLAGS.input_files:
+    raise ValueError("--input_files is required.")
+  if not FLAGS.output_dir:
+    raise ValueError("--output_dir is required.")
+
+  if not tf.gfile.IsDirectory(FLAGS.output_dir):
+    tf.gfile.MakeDirs(FLAGS.output_dir)
+
+  input_files = []
+  for pattern in FLAGS.input_files.split(","):
+    # Glob each pattern individually rather than the full comma-separated
+    # string, so that each pattern is expanded and validated on its own.
+    match = tf.gfile.Glob(pattern)
+    if not match:
+      raise ValueError("Found no files matching %s" % pattern)
+    input_files.extend(match)
+  tf.logging.info("Found %d input files.", len(input_files))
+
+  vocab = _build_vocabulary(input_files)
+
+  tf.logging.info("Generating dataset.")
+  stats = collections.Counter()
+  dataset = []
+  for filename in input_files:
+    dataset.extend(_process_input_file(filename, vocab, stats))
+    if FLAGS.max_sentences and stats["sentences_output"] >= FLAGS.max_sentences:
+      break
+
+  tf.logging.info("Generated dataset with %d sentences.", len(dataset))
+  for k, v in stats.items():
+    tf.logging.info("%s: %d", k, v)
+
+  tf.logging.info("Shuffling dataset.")
+  np.random.seed(123)
+  shuffled_indices = np.random.permutation(len(dataset))
+  val_indices = shuffled_indices[:FLAGS.num_validation_sentences]
+  train_indices = shuffled_indices[FLAGS.num_validation_sentences:]
+
+  _write_dataset("train", dataset, train_indices, FLAGS.train_output_shards)
+  _write_dataset("validation", dataset, val_indices,
+                 FLAGS.validation_output_shards)
+
+
+if __name__ == "__main__":
+  tf.app.run()
diff --git a/skip_thoughts/data/special_words.py b/skip_thoughts/data/special_words.py
new file mode 100644
index 0000000..fb76b7a
--- /dev/null
+++ b/skip_thoughts/data/special_words.py
@@ -0,0 +1,27 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Special word constants.
+
+NOTE: The ids of the EOS and UNK constants should not be modified. It is assumed
+that these always occupy the first two ids.
+"""
+
+# End of sentence.
+EOS = "<eos>"
+EOS_ID = 0
+
+# Unknown.
+UNK = "<unk>"
+UNK_ID = 1
diff --git a/skip_thoughts/decode.py b/skip_thoughts/decode.py
new file mode 100644
index 0000000..b7bcf10
--- /dev/null
+++ b/skip_thoughts/decode.py
@@ -0,0 +1,161 @@
+# Copyright 2018 Babylon Partners. All Rights Reserved.
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import tensorflow as tf + +from skip_thoughts import skip_thoughts_encoder + + +class Decoder: + def __init__( + self, + g, + tensor_names_decoder, + tensor_names_global): + self.g = g + self.tensors_decoder = _names_to_tensors( + g=self.g, x=tensor_names_decoder) + self.tensors_global = _names_to_tensors( + g=self.g, x=tensor_names_global) + + self.state = self.tensors_decoder['decoder_state'] + + self.embedding_dim = tf.shape( + self.tensors_global['word_embedding'])[-1] + + self.batch_size, self.decoder_seq_len = _get_batch_seq_len( + self.tensors_decoder['decoder_output']) + + self.softmax_flat = tf.nn.softmax( + logits=self.tensors_decoder['logits']) + + self.softmax_w_flat = tf.matmul( + self.softmax_flat, self.tensors_global['word_embedding']) + + self.softmax_w = tf.reshape( + self.softmax_w_flat, + (self.batch_size, self.decoder_seq_len, self.embedding_dim)) + + +def _is_tensor_name(x): + if ":" in x: + return True + return False + + +def _get_tensor_or_op(g, x): + if _is_tensor_name(x): + return g.get_tensor_by_name(x) + + return g.get_operation_by_name(x) + + +def _get_batch_seq_len(x): + x_sh = tf.shape(x) + return x_sh[0], x_sh[1] + + +def _names_to_tensors(g, x): + return {k: _get_tensor_or_op(g, v) for k, v in x.iteritems()} + + +def unroll_decoder( + sess, + encoder_embeddings, + encoder_mask, + decoder_name, + decoder_softmax_w_embs, + decoder_state, + steps=5): + n_input_sequences = encoder_embeddings.shape[0] + sequence_dim = encoder_embeddings.shape[2] + start_tokens = np.zeros(shape=(n_input_sequences, 1, sequence_dim)) + decoder_input = start_tokens + feed_dict = { + "encode_emb:0": encoder_embeddings, + "encode_mask:0": encoder_mask} + dec_emb_feed = "{decoder_name}_emb:0".format(decoder_name=decoder_name) + dec_mask_feed = "{decoder_name}_mask:0".format(decoder_name=decoder_name) + all_states = None + + for _ in range(steps): + len_seq = decoder_input.shape[1] + decode_mask = np.ones((n_input_sequences, len_seq)) + feed_dict.update({dec_emb_feed: decoder_input, + dec_mask_feed: decode_mask}) + softmax_w_embs, states = sess.run( + (decoder_softmax_w_embs, decoder_state), + feed_dict=feed_dict) + states_expanded = np.expand_dims(states, axis=1) + if all_states is None: + all_states = states_expanded + else: + all_states = np.concatenate((all_states, states_expanded), axis=1) + decoder_input = np.concatenate((start_tokens, softmax_w_embs), axis=1) + return all_states + + +def decode(sess, + data, + encoder, + decoder_pre, + decoder_post, + use_norm=True, + verbose=True, + batch_size=128, + use_eos=False, + steps=5): + data = encoder.encoders[0]._preprocess(data=data, use_eos=use_eos) + + pre_states = [] + post_states = [] + + batch_indices = np.arange(0, len(data), batch_size) + for batch, start_index in 
enumerate(batch_indices): + if verbose: + tf.logging.info("Batch %d / %d.", batch, len(batch_indices)) + + (encoder_embeddings, + encoder_mask) = skip_thoughts_encoder._batch_and_pad( + data[start_index:start_index + batch_size]) + + pre_states.extend(unroll_decoder( + sess=sess, + encoder_embeddings=encoder_embeddings, + encoder_mask=encoder_mask, + decoder_name='decode_pre', + decoder_softmax_w_embs=decoder_pre.softmax_w, + decoder_state=decoder_pre.state, + steps=steps)) + + post_states.extend(unroll_decoder( + sess=sess, + encoder_embeddings=encoder_embeddings, + encoder_mask=encoder_mask, + decoder_name='decode_post', + decoder_softmax_w_embs=decoder_post.softmax_w, + decoder_state=decoder_post.state, + steps=steps)) + + if use_norm: + pre_states = [v / np.linalg.norm(v) for v in pre_states] + post_states = [v / np.linalg.norm(v) for v in post_states] + + return pre_states, post_states diff --git a/skip_thoughts/encoder_manager.py b/skip_thoughts/encoder_manager.py new file mode 100644 index 0000000..55ae264 --- /dev/null +++ b/skip_thoughts/encoder_manager.py @@ -0,0 +1,140 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +# Changes by Babylon Partners +# - Added new encode-decode mode +# ============================================================================== +"""Manager class for loading and encoding with multiple skip-thoughts models. + +If multiple models are loaded at once then the encode() function returns the +concatenation of the outputs of each model. + +Example usage: + manager = EncoderManager() + manager.load_model(model_config_1, vocabulary_file_1, embedding_matrix_file_1, + checkpoint_path_1) + manager.load_model(model_config_2, vocabulary_file_2, embedding_matrix_file_2, + checkpoint_path_2) + encodings = manager.encode(data) +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections + + +import numpy as np +import tensorflow as tf + +from skip_thoughts import skip_thoughts_encoder + + +class EncoderManager(object): + """Manager class for loading and encoding with skip-thoughts models.""" + + def __init__(self): + self.encoders = [] + self.sessions = [] + self.graph = None + + def load_model(self, model_config, vocabulary_file, embedding_matrix_file, + checkpoint_path, mode='encode'): + """Loads a skip-thoughts model. + + Args: + model_config: Object containing parameters for building the model. + vocabulary_file: Path to vocabulary file containing a list of newline- + separated words where the word id is the corresponding 0-based index in + the file. + embedding_matrix_file: Path to a serialized numpy array of shape + [vocab_size, embedding_dim]. + checkpoint_path: SkipThoughtsModel checkpoint file or a directory + containing a checkpoint file. 
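+      mode: Mode in which to build the model graph; defaults to "encode". The
+        encode-decode mode added by this fork also builds the decoder portions
+        of the graph (see the change notes above and decode.py).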
+ """ + tf.logging.info("Reading vocabulary from %s", vocabulary_file) + with tf.gfile.GFile(vocabulary_file, mode="r") as f: + lines = list(f.readlines()) + reverse_vocab = [line.decode("utf-8").strip() for line in lines] + tf.logging.info("Loaded vocabulary with %d words.", len(reverse_vocab)) + + tf.logging.info("Loading embedding matrix from %s", embedding_matrix_file) + # Note: tf.gfile.GFile doesn't work here because np.load() calls f.seek() + # with 3 arguments. + with open(embedding_matrix_file, "r") as f: + embedding_matrix = np.load(f) + tf.logging.info("Loaded embedding matrix with shape %s", + embedding_matrix.shape) + + word_embeddings = collections.OrderedDict( + zip(reverse_vocab, embedding_matrix)) + + g = tf.Graph() + with g.as_default(): + encoder = skip_thoughts_encoder.SkipThoughtsEncoder(word_embeddings) + restore_model = encoder.build_graph_from_config(model_config, + checkpoint_path, + mode=mode) + + self.graph = g + sess = tf.Session(graph=g) + restore_model(sess) + + self.encoders.append(encoder) + self.sessions.append(sess) + + def encode(self, + data, + use_norm=True, + verbose=False, + batch_size=128, + use_eos=False): + """Encodes a sequence of sentences as skip-thought vectors. + + Args: + data: A list of input strings. + use_norm: If True, normalize output skip-thought vectors to unit L2 norm. + verbose: Whether to log every batch. + batch_size: Batch size for the RNN encoders. + use_eos: If True, append the end-of-sentence word to each input sentence. + + Returns: + thought_vectors: A list of numpy arrays corresponding to 'data'. + + Raises: + ValueError: If called before calling load_encoder. + """ + if not self.encoders: + raise ValueError( + "Must call load_model at least once before calling encode.") + + encoded = [] + for encoder, sess in zip(self.encoders, self.sessions): + encoded.append( + np.array( + encoder.encode( + sess, + data, + use_norm=use_norm, + verbose=verbose, + batch_size=batch_size, + use_eos=use_eos))) + + return np.concatenate(encoded, axis=1) + + def close(self): + """Closes the active TensorFlow Sessions.""" + for sess in self.sessions: + sess.close() diff --git a/skip_thoughts/evaluate.py b/skip_thoughts/evaluate.py new file mode 100644 index 0000000..e840d9d --- /dev/null +++ b/skip_thoughts/evaluate.py @@ -0,0 +1,117 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Script to evaluate a skip-thoughts model. + +This script can evaluate a model with a unidirectional encoder ("uni-skip" in +the paper); or a model with a bidirectional encoder ("bi-skip"); or the +combination of a model with a unidirectional encoder and a model with a +bidirectional encoder ("combine-skip"). + +The uni-skip model (if it exists) is specified by the flags +--uni_vocab_file, --uni_embeddings_file, --uni_checkpoint_path. 
+ +The bi-skip model (if it exists) is specified by the flags +--bi_vocab_file, --bi_embeddings_path, --bi_checkpoint_path. + +The evaluation tasks have different running times. SICK may take 5-10 minutes. +MSRP, TREC and CR may take 20-60 minutes. SUBJ, MPQA and MR may take 2+ hours. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + + +from skipthoughts import eval_classification +from skipthoughts import eval_msrp +from skipthoughts import eval_sick +from skipthoughts import eval_trec +import tensorflow as tf + +from skip_thoughts import configuration +from skip_thoughts import encoder_manager + +FLAGS = tf.flags.FLAGS + +tf.flags.DEFINE_string("eval_task", "CR", + "Name of the evaluation task to run. Available tasks: " + "MR, CR, SUBJ, MPQA, SICK, MSRP, TREC.") + +tf.flags.DEFINE_string("data_dir", None, "Directory containing training data.") + +tf.flags.DEFINE_string("uni_vocab_file", None, + "Path to vocabulary file containing a list of newline-" + "separated words where the word id is the " + "corresponding 0-based index in the file.") +tf.flags.DEFINE_string("bi_vocab_file", None, + "Path to vocabulary file containing a list of newline-" + "separated words where the word id is the " + "corresponding 0-based index in the file.") + +tf.flags.DEFINE_string("uni_embeddings_file", None, + "Path to serialized numpy array of shape " + "[vocab_size, embedding_dim].") +tf.flags.DEFINE_string("bi_embeddings_file", None, + "Path to serialized numpy array of shape " + "[vocab_size, embedding_dim].") + +tf.flags.DEFINE_string("uni_checkpoint_path", None, + "Checkpoint file or directory containing a checkpoint " + "file.") +tf.flags.DEFINE_string("bi_checkpoint_path", None, + "Checkpoint file or directory containing a checkpoint " + "file.") + +tf.logging.set_verbosity(tf.logging.INFO) + + +def main(unused_argv): + if not FLAGS.data_dir: + raise ValueError("--data_dir is required.") + + encoder = encoder_manager.EncoderManager() + + # Maybe load unidirectional encoder. + if FLAGS.uni_checkpoint_path: + print("Loading unidirectional model...") + uni_config = configuration.model_config() + encoder.load_model(uni_config, FLAGS.uni_vocab_file, + FLAGS.uni_embeddings_file, FLAGS.uni_checkpoint_path) + + # Maybe load bidirectional encoder. + if FLAGS.bi_checkpoint_path: + print("Loading bidirectional model...") + bi_config = configuration.model_config(bidirectional_encoder=True) + encoder.load_model(bi_config, FLAGS.bi_vocab_file, FLAGS.bi_embeddings_file, + FLAGS.bi_checkpoint_path) + + if FLAGS.eval_task in ["MR", "CR", "SUBJ", "MPQA"]: + eval_classification.eval_nested_kfold( + encoder, FLAGS.eval_task, FLAGS.data_dir, use_nb=False) + elif FLAGS.eval_task == "SICK": + eval_sick.evaluate(encoder, evaltest=True, loc=FLAGS.data_dir) + elif FLAGS.eval_task == "MSRP": + eval_msrp.evaluate( + encoder, evalcv=True, evaltest=True, use_feats=True, loc=FLAGS.data_dir) + elif FLAGS.eval_task == "TREC": + eval_trec.evaluate(encoder, evalcv=True, evaltest=True, loc=FLAGS.data_dir) + else: + raise ValueError("Unrecognized eval_task: %s" % FLAGS.eval_task) + + encoder.close() + + +if __name__ == "__main__": + tf.app.run() diff --git a/skip_thoughts/experiments.py b/skip_thoughts/experiments.py new file mode 100644 index 0000000..eb6bb13 --- /dev/null +++ b/skip_thoughts/experiments.py @@ -0,0 +1,51 @@ +# Copyright 2018 Babylon Partners. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Easy classes for decoder configurations.""" + + +def get_decoder_config(flags): + n_seq_decoders, n_skipgram_decoders = ( + int(flags.decoder[3]), int(flags.decoder[-1])) + + assert n_seq_decoders in [0, 1, 2, 3] + assert n_skipgram_decoders in [0, 1, 2, 3] + + decoder_config = DecoderConfig() + + if n_seq_decoders in [1, 3]: + decoder_config.sequence_decoder_cur = True + + if n_seq_decoders in [2, 3]: + decoder_config.sequence_decoder_pre = True + decoder_config.sequence_decoder_post = True + + if n_skipgram_decoders in [1, 3]: + decoder_config.skipgram_decoder_cur = True + + if n_skipgram_decoders in [2, 3]: + decoder_config.skipgram_decoder_pre = True + decoder_config.skipgram_decoder_post = True + + return decoder_config + + +class DecoderConfig: + def __init__(self): + self.sequence_decoder_pre = False + self.sequence_decoder_cur = False + self.sequence_decoder_post = False + self.skipgram_decoder_pre = False + self.skipgram_decoder_cur = False + self.skipgram_decoder_post = False diff --git a/skip_thoughts/ops/BUILD b/skip_thoughts/ops/BUILD new file mode 100644 index 0000000..4586e5a --- /dev/null +++ b/skip_thoughts/ops/BUILD @@ -0,0 +1,17 @@ +package(default_visibility = ["//skip_thoughts:internal"]) + +licenses(["notice"]) # Apache 2.0 + +exports_files(["LICENSE"]) + +py_library( + name = "input_ops", + srcs = ["input_ops.py"], + srcs_version = "PY2AND3", +) + +py_library( + name = "gru_cell", + srcs = ["gru_cell.py"], + srcs_version = "PY2AND3", +) diff --git a/skip_thoughts/ops/__init__.py b/skip_thoughts/ops/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/skip_thoughts/ops/gru_cell.py b/skip_thoughts/ops/gru_cell.py new file mode 100644 index 0000000..c4bee46 --- /dev/null +++ b/skip_thoughts/ops/gru_cell.py @@ -0,0 +1,134 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""GRU cell implementation for the skip-thought vectors model.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + + +import tensorflow as tf + +_layer_norm = tf.contrib.layers.layer_norm + + +class LayerNormGRUCell(tf.contrib.rnn.RNNCell): + """GRU cell with layer normalization. 
+ + The layer normalization implementation is based on: + + https://arxiv.org/abs/1607.06450. + + "Layer Normalization" + Jimmy Lei Ba, Jamie Ryan Kiros, Geoffrey E. Hinton + """ + + def __init__(self, + num_units, + w_initializer, + u_initializer, + b_initializer, + activation=tf.nn.tanh): + """Initializes the cell. + + Args: + num_units: Number of cell units. + w_initializer: Initializer for the "W" (input) parameter matrices. + u_initializer: Initializer for the "U" (recurrent) parameter matrices. + b_initializer: Initializer for the "b" (bias) parameter vectors. + activation: Cell activation function. + """ + self._num_units = num_units + self._w_initializer = w_initializer + self._u_initializer = u_initializer + self._b_initializer = b_initializer + self._activation = activation + + @property + def state_size(self): + return self._num_units + + @property + def output_size(self): + return self._num_units + + def _w_h_initializer(self): + """Returns an initializer for the "W_h" parameter matrix. + + See equation (23) in the paper. The "W_h" parameter matrix is the + concatenation of two parameter submatrices. The matrix returned is + [U_z, U_r]. + + Returns: + A Tensor with shape [num_units, 2 * num_units] as described above. + """ + + def _initializer(shape, dtype=tf.float32, partition_info=None): + num_units = self._num_units + assert shape == [num_units, 2 * num_units] + u_z = self._u_initializer([num_units, num_units], dtype, partition_info) + u_r = self._u_initializer([num_units, num_units], dtype, partition_info) + return tf.concat([u_z, u_r], 1) + + return _initializer + + def _w_x_initializer(self, input_dim): + """Returns an initializer for the "W_x" parameter matrix. + + See equation (23) in the paper. The "W_x" parameter matrix is the + concatenation of two parameter submatrices. The matrix returned is + [W_z, W_r]. + + Args: + input_dim: The dimension of the cell inputs. + + Returns: + A Tensor with shape [input_dim, 2 * num_units] as described above. 
+ """ + + def _initializer(shape, dtype=tf.float32, partition_info=None): + num_units = self._num_units + assert shape == [input_dim, 2 * num_units] + w_z = self._w_initializer([input_dim, num_units], dtype, partition_info) + w_r = self._w_initializer([input_dim, num_units], dtype, partition_info) + return tf.concat([w_z, w_r], 1) + + return _initializer + + def __call__(self, inputs, state, scope=None): + """GRU cell with layer normalization.""" + input_dim = inputs.get_shape().as_list()[1] + num_units = self._num_units + + with tf.variable_scope(scope or "gru_cell"): + with tf.variable_scope("gates"): + w_h = tf.get_variable( + "w_h", [num_units, 2 * num_units], + initializer=self._w_h_initializer()) + w_x = tf.get_variable( + "w_x", [input_dim, 2 * num_units], + initializer=self._w_x_initializer(input_dim)) + z_and_r = (_layer_norm(tf.matmul(state, w_h), scope="layer_norm/w_h") + + _layer_norm(tf.matmul(inputs, w_x), scope="layer_norm/w_x")) + z, r = tf.split(tf.sigmoid(z_and_r), 2, 1) + with tf.variable_scope("candidate"): + w = tf.get_variable( + "w", [input_dim, num_units], initializer=self._w_initializer) + u = tf.get_variable( + "u", [num_units, num_units], initializer=self._u_initializer) + h_hat = (r * _layer_norm(tf.matmul(state, u), scope="layer_norm/u") + + _layer_norm(tf.matmul(inputs, w), scope="layer_norm/w")) + new_h = (1 - z) * state + z * self._activation(h_hat) + return new_h, new_h diff --git a/skip_thoughts/ops/input_ops.py b/skip_thoughts/ops/input_ops.py new file mode 100644 index 0000000..6b84f42 --- /dev/null +++ b/skip_thoughts/ops/input_ops.py @@ -0,0 +1,119 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Input ops.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections + + +import tensorflow as tf +from tensorflow.python.ops import sparse_ops + +# A SentenceBatch is a pair of Tensors: +# ids: Batch of input sentences represented as sequences of word ids: an int64 +# Tensor with shape [batch_size, padded_length]. +# mask: Boolean mask distinguishing real words (1) from padded words (0): an +# int32 Tensor with shape [batch_size, padded_length]. +SentenceBatch = collections.namedtuple("SentenceBatch", ("ids", "mask")) + + +def parse_example_batch(serialized): + """Parses a batch of tf.Example protos. + + Args: + serialized: A 1-D string Tensor; a batch of serialized tf.Example protos. + Returns: + encode: A SentenceBatch of encode sentences. + decode_pre: A SentenceBatch of "previous" sentences to decode. + decode_post: A SentenceBatch of "post" sentences to decode. 
+ """ + features = tf.parse_example( + serialized, + features={ + "encode": tf.VarLenFeature(dtype=tf.int64), + "decode_pre": tf.VarLenFeature(dtype=tf.int64), + "decode_post": tf.VarLenFeature(dtype=tf.int64), + }) + + def _sparse_to_batch(sparse): + ids = tf.sparse_tensor_to_dense(sparse) # Padding with zeroes. + mask = tf.sparse_to_dense(sparse.indices, sparse.dense_shape, + tf.ones_like(sparse.values, dtype=tf.int32)) + return SentenceBatch(ids=ids, mask=mask) + + output_names = ("encode", "decode_pre", "decode_post") + return tuple(_sparse_to_batch(features[x]) for x in output_names) + + +def prefetch_input_data(reader, + file_pattern, + shuffle, + capacity, + num_reader_threads=1): + """Prefetches string values from disk into an input queue. + + Args: + reader: Instance of tf.ReaderBase. + file_pattern: Comma-separated list of file patterns (e.g. + "/tmp/train_data-?????-of-00100", where '?' acts as a wildcard that + matches any character). + shuffle: Boolean; whether to randomly shuffle the input data. + capacity: Queue capacity (number of records). + num_reader_threads: Number of reader threads feeding into the queue. + + Returns: + A Queue containing prefetched string values. + """ + data_files = [] + for pattern in file_pattern.split(","): + data_files.extend(tf.gfile.Glob(pattern)) + if not data_files: + tf.logging.fatal("Found no input files matching %s", file_pattern) + else: + tf.logging.info("Prefetching values from %d files matching %s", + len(data_files), file_pattern) + + filename_queue = tf.train.string_input_producer( + data_files, shuffle=shuffle, capacity=16, name="filename_queue") + + if shuffle: + min_after_dequeue = int(0.6 * capacity) + values_queue = tf.RandomShuffleQueue( + capacity=capacity, + min_after_dequeue=min_after_dequeue, + dtypes=[tf.string], + shapes=[[]], + name="random_input_queue") + else: + values_queue = tf.FIFOQueue( + capacity=capacity, + dtypes=[tf.string], + shapes=[[]], + name="fifo_input_queue") + + enqueue_ops = [] + for _ in range(num_reader_threads): + _, value = reader.read(filename_queue) + enqueue_ops.append(values_queue.enqueue([value])) + tf.train.queue_runner.add_queue_runner( + tf.train.queue_runner.QueueRunner(values_queue, enqueue_ops)) + tf.summary.scalar("queue/%s/fraction_of_%d_full" % (values_queue.name, + capacity), + tf.cast(values_queue.size(), tf.float32) * (1.0 / capacity)) + + return values_queue diff --git a/skip_thoughts/pretrained_embeddings.py b/skip_thoughts/pretrained_embeddings.py new file mode 100644 index 0000000..6861e32 --- /dev/null +++ b/skip_thoughts/pretrained_embeddings.py @@ -0,0 +1,97 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ==============================================================================
+# Changes by Babylon Partners
+# - Modified skip_thoughts/vocabulary_expansion.py to
+#   prepare pretrained embeddings
+# ==============================================================================
+
+
+import os
+import collections
+import tensorflow as tf
+import numpy as np
+import gensim
+
+
+FLAGS = tf.flags.FLAGS
+
+tf.flags.DEFINE_string("vocab_file", None,
+                       "Existing vocab file. "
+                       "The file format is a list of newline-separated words, "
+                       "where the word id is the corresponding 0-based index "
+                       "in the file. The 0th word must correspond to the "
+                       "<eos> token and the 1st word to the <unk> token.")
+tf.flags.DEFINE_string("word2vec_model", None,
+                       "File containing a word2vec model in binary format.")
+tf.flags.DEFINE_string("output_dir", None, "Output directory.")
+
+tf.logging.set_verbosity(tf.logging.INFO)
+
+
+def _get_vocabulary():
+  """Loads the model vocabulary.
+
+  Returns:
+    vocab: A dictionary of word to id.
+  """
+  if not FLAGS.vocab_file:
+    raise RuntimeError("No vocab file specified.")
+
+  tf.logging.info("Loading existing vocab file.")
+  vocab = collections.OrderedDict()
+  with tf.gfile.GFile(FLAGS.vocab_file, mode="r") as f:
+    for i, line in enumerate(f):
+      word = line.strip().decode("utf-8")
+      assert word not in vocab, "Attempting to add word twice: %s" % word
+      vocab[word] = i
+  tf.logging.info("Read vocab of size %d from %s",
+                  len(vocab), FLAGS.vocab_file)
+  return vocab
+
+
+def main(unused_argv):
+  vocab = _get_vocabulary()
+  tf.logging.info("Loading word2vec model.")
+  w2v_model = gensim.models.KeyedVectors.load_word2vec_format(
+      FLAGS.word2vec_model, binary=True)
+  tf.logging.info("Loaded word2vec model.")
+
+  vocab_size = len(vocab)
+  init_embeddings = np.zeros(shape=(vocab_size, w2v_model.vector_size),
+                             dtype=np.float32)
+  # Fixed, distinguishable vectors for the two reserved tokens.
+  eos_vector = np.zeros(shape=(1, w2v_model.vector_size),
+                        dtype=np.float32)
+  eos_vector[0][0] = -1
+  unk_vector = np.zeros(shape=(1, w2v_model.vector_size),
+                        dtype=np.float32)
+  unk_vector[0][-1] = -1
+
+  tf.logging.info("Building embedding matrix.")
+  for word, idx in vocab.items():
+    if word in w2v_model:
+      init_embeddings[idx] = w2v_model[word]
+    else:
+      init_embeddings[idx] = unk_vector
+  init_embeddings[0] = eos_vector
+  init_embeddings[1] = unk_vector
+  embeddings_file = os.path.join(FLAGS.output_dir, "init_embeddings.npy")
+  if not os.path.exists(FLAGS.output_dir):
+    os.makedirs(FLAGS.output_dir)
+  np.save(embeddings_file, init_embeddings)
+
+
+if __name__ == "__main__":
+  tf.app.run()
diff --git a/skip_thoughts/skip_thoughts_encoder.py b/skip_thoughts/skip_thoughts_encoder.py
new file mode 100644
index 0000000..89e58ba
--- /dev/null
+++ b/skip_thoughts/skip_thoughts_encoder.py
@@ -0,0 +1,262 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================== +# Changes by Babylon Partners +# - Added new encode-decode mode +# ============================================================================== +"""Class for encoding text using a trained SkipThoughtsModel. + +Example usage: + g = tf.Graph() + with g.as_default(): + encoder = SkipThoughtsEncoder(embeddings) + restore_fn = encoder.build_graph_from_config(model_config, checkpoint_path) + + with tf.Session(graph=g) as sess: + restore_fn(sess) + skip_thought_vectors = encoder.encode(sess, data) +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os.path + + +import nltk +import nltk.tokenize +import numpy as np +import tensorflow as tf + +from skip_thoughts import skip_thoughts_model +from skip_thoughts.data import special_words + + +def _pad(seq, target_len): + """Pads a sequence of word embeddings up to the target length. + + Args: + seq: Sequence of word embeddings. + target_len: Desired padded sequence length. + + Returns: + embeddings: Input sequence padded with zero embeddings up to the target + length. + mask: A 0/1 vector with zeros corresponding to padded embeddings. + + Raises: + ValueError: If len(seq) is not in the interval (0, target_len]. + """ + seq_len = len(seq) + if seq_len <= 0 or seq_len > target_len: + raise ValueError("Expected 0 < len(seq) <= %d, got %d" % (target_len, + seq_len)) + + emb_dim = seq[0].shape[0] + padded_seq = np.zeros(shape=(target_len, emb_dim), dtype=seq[0].dtype) + mask = np.zeros(shape=(target_len,), dtype=np.int8) + for i in range(seq_len): + padded_seq[i] = seq[i] + mask[i] = 1 + return padded_seq, mask + + +def _batch_and_pad(sequences): + """Batches and pads sequences of word embeddings into a 2D array. + + Args: + sequences: A list of batch_size sequences of word embeddings. + + Returns: + embeddings: A numpy array with shape [batch_size, padded_length, emb_dim]. + mask: A numpy 0/1 array with shape [batch_size, padded_length] with zeros + corresponding to padded elements. + """ + batch_embeddings = [] + batch_mask = [] + batch_len = max([len(seq) for seq in sequences]) + for seq in sequences: + embeddings, mask = _pad(seq, batch_len) + batch_embeddings.append(embeddings) + batch_mask.append(mask) + return np.array(batch_embeddings), np.array(batch_mask) + + +class SkipThoughtsEncoder(object): + """Skip-thoughts sentence encoder.""" + + def __init__(self, embeddings): + """Initializes the encoder. + + Args: + embeddings: Dictionary of word to embedding vector (1D numpy array). + """ + self._sentence_detector = nltk.data.load("tokenizers/punkt/english.pickle") + self._embeddings = embeddings + + def _create_restore_fn(self, checkpoint_path, saver): + """Creates a function that restores a model from checkpoint. + + Args: + checkpoint_path: Checkpoint file or a directory containing a checkpoint + file. + saver: Saver for restoring variables from the checkpoint file. + + Returns: + restore_fn: A function such that restore_fn(sess) loads model variables + from the checkpoint file. + + Raises: + ValueError: If checkpoint_path does not refer to a checkpoint file or a + directory containing a checkpoint file. 
+ """ + if tf.gfile.IsDirectory(checkpoint_path): + latest_checkpoint = tf.train.latest_checkpoint(checkpoint_path) + if not latest_checkpoint: + raise ValueError("No checkpoint file found in: %s" % checkpoint_path) + checkpoint_path = latest_checkpoint + + def _restore_fn(sess): + tf.logging.info("Loading model from checkpoint: %s", checkpoint_path) + saver.restore(sess, checkpoint_path) + tf.logging.info("Successfully loaded checkpoint: %s", + os.path.basename(checkpoint_path)) + + return _restore_fn + + def build_graph_from_config(self, model_config, checkpoint_path, + mode="encode"): + """Builds the inference graph from a configuration object. + + Args: + model_config: Object containing configuration for building the model. + checkpoint_path: Checkpoint file or a directory containing a checkpoint + file. + + Returns: + restore_fn: A function such that restore_fn(sess) loads model variables + from the checkpoint file. + """ + tf.logging.info("Building model.") + model = skip_thoughts_model.SkipThoughtsModel(model_config, mode=mode) + model.build() + saver = tf.train.Saver() + + return self._create_restore_fn(checkpoint_path, saver) + + def build_graph_from_proto(self, graph_def_file, saver_def_file, + checkpoint_path): + """Builds the inference graph from serialized GraphDef and SaverDef protos. + + Args: + graph_def_file: File containing a serialized GraphDef proto. + saver_def_file: File containing a serialized SaverDef proto. + checkpoint_path: Checkpoint file or a directory containing a checkpoint + file. + + Returns: + restore_fn: A function such that restore_fn(sess) loads model variables + from the checkpoint file. + """ + # Load the Graph. + tf.logging.info("Loading GraphDef from file: %s", graph_def_file) + graph_def = tf.GraphDef() + with tf.gfile.FastGFile(graph_def_file, "rb") as f: + graph_def.ParseFromString(f.read()) + tf.import_graph_def(graph_def, name="") + + # Load the Saver. + tf.logging.info("Loading SaverDef from file: %s", saver_def_file) + saver_def = tf.train.SaverDef() + with tf.gfile.FastGFile(saver_def_file, "rb") as f: + saver_def.ParseFromString(f.read()) + saver = tf.train.Saver(saver_def=saver_def) + + return self._create_restore_fn(checkpoint_path, saver) + + def _tokenize(self, item): + """Tokenizes an input string into a list of words.""" + tokenized = [] + for s in self._sentence_detector.tokenize(item): + tokenized.extend(nltk.tokenize.word_tokenize(s)) + + return tokenized + + def _word_to_embedding(self, w): + """Returns the embedding of a word.""" + return self._embeddings.get(w, self._embeddings[special_words.UNK]) + + def _preprocess(self, data, use_eos): + """Preprocesses text for the encoder. + + Args: + data: A list of input strings. + use_eos: Whether to append the end-of-sentence word to each sentence. + + Returns: + embeddings: A list of word embedding sequences corresponding to the input + strings. + """ + preprocessed_data = [] + for item in data: + tokenized = self._tokenize(item) + if use_eos: + tokenized.append(special_words.EOS) + preprocessed_data.append([self._word_to_embedding(w) for w in tokenized]) + return preprocessed_data + + def encode(self, + sess, + data, + use_norm=True, + verbose=True, + batch_size=128, + use_eos=False): + """Encodes a sequence of sentences as skip-thought vectors. + + Args: + sess: TensorFlow Session. + data: A list of input strings. + use_norm: Whether to normalize skip-thought vectors to unit L2 norm. + verbose: Whether to log every batch. + batch_size: Batch size for the encoder. 
+      use_eos: Whether to append the end-of-sentence word to each input
+        sentence.
+
+    Returns:
+      thought_vectors: A list of numpy arrays corresponding to the skip-thought
+        encodings of sentences in 'data'.
+    """
+    data = self._preprocess(data, use_eos)
+    thought_vectors = []
+
+    batch_indices = np.arange(0, len(data), batch_size)
+    for batch, start_index in enumerate(batch_indices):
+      if verbose:
+        tf.logging.info("Batch %d / %d.", batch, len(batch_indices))
+
+      embeddings, mask = _batch_and_pad(
+          data[start_index:start_index + batch_size])
+      feed_dict = {
+          "encode_emb:0": embeddings,
+          "encode_mask:0": mask,
+      }
+      thought_vectors.extend(
+          sess.run("encoder/thought_vectors:0", feed_dict=feed_dict))
+
+    if use_norm:
+      thought_vectors = [v / np.linalg.norm(v) for v in thought_vectors]
+
+    return thought_vectors
diff --git a/skip_thoughts/skip_thoughts_model.py b/skip_thoughts/skip_thoughts_model.py
new file mode 100644
index 0000000..a635f1f
--- /dev/null
+++ b/skip_thoughts/skip_thoughts_model.py
@@ -0,0 +1,559 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+# Changes by Babylon Partners
+# - Added code to support sequence and skipgram
+#   decoders for previous, current, and next sentences
+# - Added _build_skipgram_decoder function
+# - Added encode-decode mode
+# - Added code to support decoder unrolling
+# - Added summaries
+# ==============================================================================
+"""Skip-Thoughts model for learning sentence vectors.
+
+The model is based on the paper:
+
+  "Skip-Thought Vectors"
+  Ryan Kiros, Yukun Zhu, Ruslan Salakhutdinov, Richard S. Zemel,
+  Antonio Torralba, Raquel Urtasun, Sanja Fidler.
+  https://papers.nips.cc/paper/5950-skip-thought-vectors.pdf
+
+Layer normalization is applied based on the paper:
+
+  "Layer Normalization"
+  Jimmy Lei Ba, Jamie Ryan Kiros, Geoffrey E. Hinton
+  https://arxiv.org/abs/1607.06450
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+
+import tensorflow as tf
+import numpy as np
+from skip_thoughts.ops import gru_cell
+from skip_thoughts.ops import input_ops
+
+import skip_thoughts.summaries as summaries
+
+
+def random_orthonormal_initializer(shape, dtype=tf.float32,
+                                   partition_info=None):
+  # pylint: disable=unused-argument
+  """Variable initializer that produces a random orthonormal matrix."""
+  if len(shape) != 2 or shape[0] != shape[1]:
+    raise ValueError("Expecting square shape, got %s" % shape)
+  _, u, _ = tf.svd(tf.random_normal(shape, dtype=dtype), full_matrices=True)
+  return u
+
+
+class SkipThoughtsModel(object):
+  """Skip-thoughts model."""
+
+  def __init__(self, config, mode="train", input_reader=None):
+    """Basic setup. The actual TensorFlow graph is constructed in build().
+
+    Args:
+      config: Object containing configuration parameters.
+      mode: "train", "eval", "encode" or "encode-decode".
+ input_reader: Subclass of tf.ReaderBase for reading the input + serialized tf.Example protocol buffers. Defaults to + TFRecordReader. + + Raises: + ValueError: If mode is invalid. + """ + if mode not in ["train", "eval", "encode", "encode-decode"]: + raise ValueError("Unrecognized mode: %s" % mode) + + self.config = config + self.mode = mode + self.reader = input_reader if input_reader else tf.TFRecordReader() + + # Initializer used for non-recurrent weights. + self.uniform_initializer = tf.random_uniform_initializer( + minval=-self.config.uniform_init_scale, + maxval=self.config.uniform_init_scale) + + # Input sentences represented as sequences of word ids. + # "encode" is the source sentence, + # "decode_pre" is the previous sentence and + # "decode_post" is the next sentence. + # Each is an int64 Tensor with shape [batch_size, padded_length]. + self.encode_ids = None + self.decode_pre_ids = None + self.decode_post_ids = None + + # Boolean masks distinguishing real words (1) from padded words (0). + # Each is an int32 Tensor with shape [batch_size, padded_length]. + self.encode_mask = None + self.decode_pre_mask = None + self.decode_post_mask = None + + # Input sentences represented as sequences of word embeddings. + # Each is a float32 Tensor with shape + # [batch_size, padded_length, emb_dim]. + self.encode_emb = None + self.decode_pre_emb = None + self.decode_post_emb = None + + # The output from the sentence encoder. + # A float32 Tensor with shape [batch_size, num_gru_units]. + self.thought_vectors = None + + # The cross entropy losses and corresponding weights of the decoders. + # Used for evaluation. + self.target_cross_entropy_losses = [] + self.target_cross_entropy_loss_weights = [] + + # The total loss to optimize. + self.total_loss = None + + tvars = tf.trainable_variables() + with tf.name_scope('trainable_vars'): + # Summarise all variables + for var in tvars: + summaries.variable_summaries(var, summary_prefix=var.name) + + def build_inputs(self): + """Builds the ops for reading input data. + + Outputs: + self.encode_ids + self.decode_pre_ids + self.decode_post_ids + self.encode_mask + self.decode_pre_mask + self.decode_post_mask + """ + if self.mode == "encode": + # Word embeddings are fed from an external vocabulary which has + # possibly been expanded (see vocabulary_expansion.py). + encode_ids = None + decode_pre_ids = None + decode_post_ids = None + + encode_mask = tf.placeholder( + tf.int8, (None, None), name="encode_mask") + decode_pre_mask = None + decode_post_mask = None + elif self.mode == "encode-decode": + # Word embeddings are fed from an external vocabulary which has + # possibly been expanded (see vocabulary_expansion.py). + encode_ids = None + decode_pre_ids = None + decode_post_ids = None + + encode_mask = tf.placeholder( + tf.int8, (None, None), name="encode_mask") + decode_pre_mask = tf.placeholder( + tf.int8, (None, None), name="decode_pre_mask") + decode_post_mask = tf.placeholder( + tf.int8, (None, None), name="decode_post_mask") + else: + # Prefetch serialized tf.Example protos. + input_queue = input_ops.prefetch_input_data( + self.reader, + self.config.input_file_pattern, + shuffle=self.config.shuffle_input_data, + capacity=self.config.input_queue_capacity, + num_reader_threads=self.config.num_input_reader_threads) + + # Deserialize a batch. 
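+      # parse_example_batch (below) splits each serialized tf.Example into
+      # the encode, decode_pre and decode_post sentences and their masks.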
+ serialized = input_queue.dequeue_many(self.config.batch_size) + encode, decode_pre, decode_post = input_ops.parse_example_batch( + serialized) + + encode_ids = encode.ids + decode_pre_ids = decode_pre.ids + decode_post_ids = decode_post.ids + + encode_mask = encode.mask + decode_pre_mask = decode_pre.mask + decode_post_mask = decode_post.mask + + self.encode_ids = encode_ids + self.decode_pre_ids = decode_pre_ids + self.decode_post_ids = decode_post_ids + + self.encode_mask = encode_mask + self.decode_pre_mask = decode_pre_mask + self.decode_post_mask = decode_post_mask + + def build_word_embeddings(self): + """Builds the word embeddings. + + Inputs: + self.encode_ids + self.decode_pre_ids + self.decode_post_ids + + Outputs: + self.encode_emb + self.decode_pre_emb + self.decode_post_emb + """ + if self.mode == "encode": + # Word embeddings are fed from an external vocabulary which has + # possibly been expanded (see vocabulary_expansion.py). + encode_emb = tf.placeholder(tf.float32, ( + None, None, self.config.word_embedding_dim), "encode_emb") + # No sequences to decode. + decode_pre_emb = None + decode_post_emb = None + elif self.mode == "encode-decode": + # Word embeddings are fed from an external vocabulary which has + # possibly been expanded (see vocabulary_expansion.py). + encode_emb = tf.placeholder(tf.float32, ( + None, None, self.config.word_embedding_dim), "encode_emb") + decode_pre_emb = tf.placeholder(tf.float32, ( + None, None, self.config.word_embedding_dim), "decode_pre_emb") + decode_post_emb = tf.placeholder(tf.float32, ( + None, None, self.config.word_embedding_dim), "decode_post_emb") + + word_emb = tf.get_variable( + name="word_embedding", + shape=[self.config.vocab_size, self.config.word_embedding_dim], + initializer=self.uniform_initializer) + else: + if self.config.pretrained_word_emb_file: + word_emb_values = np.load(self.config.pretrained_word_emb_file) + assert self.config.word_embedding_dim == word_emb_values.shape[1] + + word_emb = tf.get_variable( + name="word_embedding", + initializer=tf.constant(word_emb_values, dtype=tf.float32), + trainable=self.config.word_emb_trainable) + else: + word_emb = tf.get_variable( + name="word_embedding", + shape=[self.config.vocab_size, self.config.word_embedding_dim], + initializer=self.uniform_initializer) + + encode_emb = tf.nn.embedding_lookup(word_emb, self.encode_ids) + decode_pre_emb = tf.nn.embedding_lookup(word_emb, self.decode_pre_ids) + decode_post_emb = tf.nn.embedding_lookup(word_emb, self.decode_post_ids) + + self.encode_emb = encode_emb + self.decode_pre_emb = decode_pre_emb + self.decode_post_emb = decode_post_emb + + def _initialize_gru_cell(self, num_units): + """Initializes a GRU cell. + + The Variables of the GRU cell are initialized in a way that exactly matches + the skip-thoughts paper: recurrent weights are initialized from random + orthonormal matrices and non-recurrent weights are initialized from random + uniform matrices. + + Args: + num_units: Number of output units. + + Returns: + cell: An instance of RNNCell with variable initializers that match the + skip-thoughts paper. + """ + return gru_cell.LayerNormGRUCell( + num_units, + w_initializer=self.uniform_initializer, + u_initializer=random_orthonormal_initializer, + b_initializer=tf.constant_initializer(0.0)) + + def build_encoder(self): + """Builds the sentence encoder. 
+ + Inputs: + self.encode_emb + self.encode_mask + + Outputs: + self.thought_vectors + + Raises: + ValueError: if config.bidirectional_encoder is True and config.encoder_dim + is odd. + """ + with tf.variable_scope("encoder") as scope: + if self.config.skipgram_encoder: + w_dim = self.config.word_embedding_dim + e_dim = self.config.encoder_dim + + # For FastSent sentence emb dim is the same word emb dim + assert w_dim == e_dim + + encode_emb = tf.reshape(self.encode_emb, [-1, w_dim]) + weights = tf.to_float(tf.reshape(self.encode_mask, [-1, 1])) + encode_emb = encode_emb * weights + seq_len = tf.shape(self.encode_mask)[-1] + encode_emb = tf.reshape(encode_emb, tf.stack([-1, seq_len, w_dim])) + + self.thought_vectors = tf.reduce_sum(encode_emb, + axis=1, + name="thought_vectors") + return + + length = tf.to_int32(tf.reduce_sum(self.encode_mask, 1), name="length") + + if self.config.bidirectional_encoder: + if self.config.encoder_dim % 2: + raise ValueError( + "encoder_dim must be even when using a bidirectional encoder.") + num_units = self.config.encoder_dim // 2 + cell_fw = self._initialize_gru_cell(num_units) # Forward encoder + cell_bw = self._initialize_gru_cell(num_units) # Backward encoder + _, states = tf.nn.bidirectional_dynamic_rnn( + cell_fw=cell_fw, + cell_bw=cell_bw, + inputs=self.encode_emb, + sequence_length=length, + dtype=tf.float32, + scope=scope) + thought_vectors = tf.concat(states, 1, name="thought_vectors") + else: + cell = self._initialize_gru_cell(self.config.encoder_dim) + _, state = tf.nn.dynamic_rnn( + cell=cell, + inputs=self.encode_emb, + sequence_length=length, + dtype=tf.float32, + scope=scope) + # Use an identity operation to name the Tensor in the Graph. + thought_vectors = tf.identity(state, name="thought_vectors") + + self.thought_vectors = thought_vectors + + def _build_sequence_decoder(self, name, embeddings, targets, mask, + initial_state, reuse_logits): + """Builds a sentence decoder. + + Args: + name: Decoder name. + embeddings: Batch of sentences to decode; a float32 Tensor with shape + [batch_size, padded_length, emb_dim]. + targets: Batch of target word ids; an int64 Tensor with shape + [batch_size, padded_length]. + mask: A 0/1 Tensor with shape [batch_size, padded_length]. + initial_state: Initial state of the GRU. A float32 Tensor with shape + [batch_size, num_gru_cells]. + reuse_logits: Whether to reuse the logits weights. + """ + # Decoder RNN. + cell = self._initialize_gru_cell(self.config.encoder_dim) + with tf.variable_scope(name) as scope: + # Add a padding word at the start of each sentence (to correspond to the + # prediction of the first word) and remove the last word. + decoder_input = tf.pad( + embeddings[:, :-1, :], [[0, 0], [1, 0], [0, 0]], name="input") + length = tf.reduce_sum(mask, 1, name="length") + decoder_output, decoder_state = tf.nn.dynamic_rnn( + cell=cell, + inputs=decoder_input, + sequence_length=length, + initial_state=initial_state, + scope=scope) + + decoder_output = tf.identity(decoder_output, name='decoder_output') + decoder_state = tf.identity(decoder_state, name='decoder_state') + + # Stack batch vertically. + decoder_output = tf.reshape(decoder_output, [-1, self.config.encoder_dim]) + + # Logits. 
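+    # A single fully connected layer projects the flattened decoder outputs
+    # onto the vocabulary. When reuse_logits is True, these projection
+    # weights are shared with the previously built decoder.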
+ with tf.variable_scope("logits", reuse=reuse_logits) as scope: + logits = tf.contrib.layers.fully_connected( + inputs=decoder_output, + num_outputs=self.config.vocab_size, + activation_fn=None, + weights_initializer=self.uniform_initializer, + scope=scope) + + logits = tf.identity(logits, name='logits/' + name) + + # If we just want the encode-decode, stop here + if self.mode == "encode-decode": + return None + + targets = tf.reshape(targets, [-1]) + weights = tf.to_float(tf.reshape(mask, [-1])) + + losses = tf.nn.sparse_softmax_cross_entropy_with_logits( + labels=targets, logits=logits) + batch_loss = tf.reduce_sum(losses * weights) + + if self.config.normalise_decoder_losses: + batch_loss = batch_loss / self.config.num_sequence_decoders + + batch_loss = batch_loss * self.config.sequence_prefactor + + tf.losses.add_loss(batch_loss) + + tf.summary.scalar("losses/" + name, batch_loss) + + self.target_cross_entropy_losses.append(losses) + self.target_cross_entropy_loss_weights.append(weights) + + def _build_skipgram_decoder(self, name, targets, mask, reuse_logits): + + """Builds a skipgram decoder. + + Args: + name: Decoder name. + targets: Batch of target word ids; an int64 Tensor with shape + [batch_size, padded_length]. + mask: A 0/1 Tensor with shape [batch_size, padded_length]. + reuse_logits: Whether to reuse the logits weights. + """ + + with tf.variable_scope("skipgram_logits", reuse=reuse_logits) as scope: + skipgram_logits = tf.contrib.layers.fully_connected( + inputs=self.thought_vectors, + num_outputs=self.config.vocab_size, + activation_fn=None, + weights_initializer=self.uniform_initializer, + scope=scope) + + if self.mode == "encode-decode": + return None + + multiples = tf.stack([1, tf.shape(targets)[-1], 1]) + skipgram_logits = tf.expand_dims(skipgram_logits, 1) + skipgram_logits = tf.tile(skipgram_logits, multiples) + skipgram_logits = tf.reshape(skipgram_logits, [-1, self.config.vocab_size]) + + targets = tf.reshape(targets, [-1]) + weights = tf.to_float(tf.reshape(mask, [-1])) + + losses = tf.nn.sparse_softmax_cross_entropy_with_logits( + labels=targets, logits=skipgram_logits) + + batch_loss = tf.reduce_sum(losses * weights) + + if self.config.normalise_decoder_losses: + batch_loss = batch_loss / self.config.num_skipgram_decoders + + batch_loss = batch_loss * self.config.skipgram_prefactor + + tf.losses.add_loss(batch_loss) + + tf.summary.scalar("losses/" + name, batch_loss) + + self.target_cross_entropy_losses.append(losses) + self.target_cross_entropy_loss_weights.append(weights) + + def build_sequence_decoders(self): + """Builds the sentence decoders. + + Inputs: + self.decode_pre_emb + self.decode_post_emb + self.decode_pre_ids + self.decode_post_ids + self.decode_pre_mask + self.decode_post_mask + self.thought_vectors + + Outputs: + self.target_cross_entropy_losses + self.target_cross_entropy_loss_weights + """ + reuse_logits = False + if self.config.sequence_decoder_pre: + # Pre-sentence decoder. + self._build_sequence_decoder("decoder_pre", self.decode_pre_emb, + self.decode_pre_ids, self.decode_pre_mask, + self.thought_vectors, reuse_logits) + reuse_logits = True + + if self.config.sequence_decoder_post: + # Post-sentence decoder. Logits weights are reused. + self._build_sequence_decoder("decoder_post", self.decode_post_emb, + self.decode_post_ids, self.decode_post_mask, + self.thought_vectors, reuse_logits) + reuse_logits = True + + if self.config.sequence_decoder_cur: + # Cur-sentence decoder. Logits weights are reused. 
+ self._build_sequence_decoder("decoder_cur", self.encode_emb, + self.encode_ids, self.encode_mask, + self.thought_vectors, reuse_logits) + + def build_skipgram_decoders(self): + """Builds the sentence decoders. + + Inputs: + self.decode_pre_ids + self.decode_post_ids + self.decode_pre_mask + self.decode_post_mask + self.thought_vectors + + Outputs: + self.target_cross_entropy_losses + self.target_cross_entropy_loss_weights + """ + reuse_logits = False + if self.config.skipgram_decoder_pre: + # Pre-sentence decoder. + self._build_skipgram_decoder("skipgram_pre", + self.decode_pre_ids, self.decode_pre_mask, + reuse_logits) + reuse_logits = True + + if self.config.skipgram_decoder_post: + # Post-sentence decoder. Logits weights are reused. + self._build_skipgram_decoder("skipgram_post", + self.decode_post_ids, self.decode_post_mask, + reuse_logits) + reuse_logits = True + + if self.config.skipgram_decoder_cur: + # Cur-sentence decoder. Logits weights are reused. + self._build_skipgram_decoder("skipgram_cur", + self.encode_ids, self.encode_mask, + reuse_logits) + + def build_decoders(self): + if self.mode != "encode": + self.build_sequence_decoders() + self.build_skipgram_decoders() + + def build_loss(self): + """Builds the loss Tensor. + + Outputs: + self.total_loss + """ + if self.mode not in ["encode", 'encode-decode']: + total_loss = tf.losses.get_total_loss() + tf.summary.scalar("losses/total", total_loss) + + self.total_loss = total_loss + + def build_global_step(self): + """Builds the global step Tensor. + + Outputs: + self.global_step + """ + self.global_step = tf.contrib.framework.create_global_step() + + def build(self): + """Creates all ops for training, evaluation or encoding.""" + self.build_inputs() + self.build_word_embeddings() + self.build_encoder() + self.build_decoders() + self.build_loss() + self.build_global_step() diff --git a/skip_thoughts/summaries.py b/skip_thoughts/summaries.py new file mode 100644 index 0000000..35be983 --- /dev/null +++ b/skip_thoughts/summaries.py @@ -0,0 +1,34 @@ +# Copyright 2018 Babylon Partners. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +import tensorflow as tf + + +def variable_summaries(var, summary_prefix): + """Attach a lot of summaries to a Tensor + (for TensorBoard visualization).""" + mean = tf.reduce_mean(var) + tf.summary.scalar('{sp} mean'.format( + sp=summary_prefix), mean) + with tf.name_scope('stddev'): + stddev = tf.sqrt(tf.reduce_mean(tf.square(var - mean))) + tf.summary.scalar('{sp} stddev'.format( + sp=summary_prefix), stddev) + tf.summary.scalar('{sp} max'.format( + sp=summary_prefix), tf.reduce_max(var)) + tf.summary.scalar('{sp} min'.format( + sp=summary_prefix), tf.reduce_min(var)) + tf.summary.histogram('{sp} histogram'.format( + sp=summary_prefix), var) diff --git a/skip_thoughts/track_perplexity.py b/skip_thoughts/track_perplexity.py new file mode 100644 index 0000000..05d0e33 --- /dev/null +++ b/skip_thoughts/track_perplexity.py @@ -0,0 +1,199 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tracks training progress via per-word perplexity. + +This script should be run concurrently with training so that summaries show up +in TensorBoard. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math +import os.path +import time + + +import numpy as np +import tensorflow as tf + +from skip_thoughts import configuration +from skip_thoughts import skip_thoughts_model + +FLAGS = tf.flags.FLAGS + +tf.flags.DEFINE_string("input_file_pattern", None, + "File pattern of sharded TFRecord input files.") +tf.flags.DEFINE_string("checkpoint_dir", None, + "Directory containing model checkpoints.") +tf.flags.DEFINE_string("eval_dir", None, "Directory to write event logs to.") + +tf.flags.DEFINE_integer("eval_interval_secs", 600, + "Interval between evaluation runs.") +tf.flags.DEFINE_integer("num_eval_examples", 50000, + "Number of examples for evaluation.") + +tf.flags.DEFINE_integer("min_global_step", 100, + "Minimum global step to run evaluation.") + +tf.logging.set_verbosity(tf.logging.INFO) + + +def evaluate_model(sess, losses, weights, num_batches, global_step, + summary_writer, summary_op): + """Computes perplexity-per-word over the evaluation dataset. + + Summaries and perplexity-per-word are written out to the eval directory. + + Args: + sess: Session object. + losses: A Tensor of any shape; the target cross entropy losses for the + current batch. + weights: A Tensor of weights corresponding to losses. + num_batches: Integer; the number of evaluation batches. + global_step: Integer; global step of the model checkpoint. + summary_writer: Instance of SummaryWriter. + summary_op: Op for generating model summaries. + """ + # Log model summaries on a single batch. 
+  summary_str = sess.run(summary_op)
+  summary_writer.add_summary(summary_str, global_step)
+
+  start_time = time.time()
+  sum_losses = 0.0
+  sum_weights = 0.0
+  for i in xrange(num_batches):
+    batch_losses, batch_weights = sess.run([losses, weights])
+    sum_losses += np.sum(batch_losses * batch_weights)
+    sum_weights += np.sum(batch_weights)
+    if not i % 100:
+      tf.logging.info("Computed losses for %d of %d batches.", i + 1,
+                      num_batches)
+  eval_time = time.time() - start_time
+
+  # Per-word perplexity is the exponential of the average per-word cross
+  # entropy loss.
+  perplexity = math.exp(sum_losses / sum_weights)
+  tf.logging.info("Perplexity = %f (%.2f sec)", perplexity, eval_time)
+
+  # Log perplexity to the SummaryWriter.
+  summary = tf.Summary()
+  value = summary.value.add()
+  value.simple_value = perplexity
+  value.tag = "perplexity"
+  summary_writer.add_summary(summary, global_step)
+
+  # Write the Events file to the eval directory.
+  summary_writer.flush()
+  tf.logging.info("Finished processing evaluation at global step %d.",
+                  global_step)
+
+
+def run_once(model, losses, weights, saver, summary_writer, summary_op):
+  """Evaluates the latest model checkpoint.
+
+  Args:
+    model: Instance of SkipThoughtsModel; the model to evaluate.
+    losses: Tensor; the target cross entropy losses for the current batch.
+    weights: A Tensor of weights corresponding to losses.
+    saver: Instance of tf.train.Saver for restoring model Variables.
+    summary_writer: Instance of FileWriter.
+    summary_op: Op for generating model summaries.
+  """
+  model_path = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
+  if not model_path:
+    tf.logging.info("Skipping evaluation. No checkpoint found in: %s",
+                    FLAGS.checkpoint_dir)
+    return
+
+  with tf.Session() as sess:
+    # Load model from checkpoint.
+    tf.logging.info("Loading model from checkpoint: %s", model_path)
+    saver.restore(sess, model_path)
+    global_step = tf.train.global_step(sess, model.global_step.name)
+    tf.logging.info("Successfully loaded %s at global step = %d.",
+                    os.path.basename(model_path), global_step)
+    if global_step < FLAGS.min_global_step:
+      tf.logging.info("Skipping evaluation. Global step = %d < %d", global_step,
+                      FLAGS.min_global_step)
+      return
+
+    # Start the queue runners.
+    coord = tf.train.Coordinator()
+    threads = tf.train.start_queue_runners(coord=coord)
+
+    num_eval_batches = int(
+        math.ceil(FLAGS.num_eval_examples / model.config.batch_size))
+
+    # Run evaluation on the latest checkpoint.
+    try:
+      evaluate_model(sess, losses, weights, num_eval_batches, global_step,
+                     summary_writer, summary_op)
+    except tf.errors.InvalidArgumentError:
+      tf.logging.error(
+          "Evaluation raised InvalidArgumentError (e.g. due to NaNs).")
+    finally:
+      coord.request_stop()
+      coord.join(threads, stop_grace_period_secs=10)
+
+
+def main(unused_argv):
+  if not FLAGS.input_file_pattern:
+    raise ValueError("--input_file_pattern is required.")
+  if not FLAGS.checkpoint_dir:
+    raise ValueError("--checkpoint_dir is required.")
+  if not FLAGS.eval_dir:
+    raise ValueError("--eval_dir is required.")
+
+  # Create the evaluation directory if it doesn't exist.
+  eval_dir = FLAGS.eval_dir
+  if not tf.gfile.IsDirectory(eval_dir):
+    tf.logging.info("Creating eval directory: %s", eval_dir)
+    tf.gfile.MakeDirs(eval_dir)
+
+  g = tf.Graph()
+  with g.as_default():
+    # Build the model for evaluation.
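+    # Shuffling is disabled and the queue capacity is capped at
+    # num_eval_examples so that each evaluation run sees the same examples.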
+ model_config = configuration.model_config( + input_file_pattern=FLAGS.input_file_pattern, + input_queue_capacity=FLAGS.num_eval_examples, + shuffle_input_data=False) + model = skip_thoughts_model.SkipThoughtsModel(model_config, mode="eval") + model.build() + + losses = tf.concat(model.target_cross_entropy_losses, 0) + weights = tf.concat(model.target_cross_entropy_loss_weights, 0) + + # Create the Saver to restore model Variables. + saver = tf.train.Saver() + + # Create the summary operation and the summary writer. + summary_op = tf.summary.merge_all() + summary_writer = tf.summary.FileWriter(eval_dir) + + g.finalize() + + # Run a new evaluation run every eval_interval_secs. + while True: + start = time.time() + tf.logging.info("Starting evaluation at " + time.strftime( + "%Y-%m-%d-%H:%M:%S", time.localtime())) + run_once(model, losses, weights, saver, summary_writer, summary_op) + time_to_next_eval = start + FLAGS.eval_interval_secs - time.time() + if time_to_next_eval > 0: + time.sleep(time_to_next_eval) + + +if __name__ == "__main__": + tf.app.run() diff --git a/skip_thoughts/train.py b/skip_thoughts/train.py new file mode 100644 index 0000000..e7c18d2 --- /dev/null +++ b/skip_thoughts/train.py @@ -0,0 +1,245 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +# Changes by Babylon Partners +# - Added +# --run_dir +# --pretrained_word_emb_file +# --word_emb_trainable +# --skipgram_encoder +# --decoder +# --normalise_decoder_losses +# --skipgram_prefactor +# --sequence_prefactor +# ============================================================================== +"""Train the skip-thoughts model.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import cPickle +import json +import os +import time + +import tensorflow as tf + +from skip_thoughts import configuration +from skip_thoughts import experiments +from skip_thoughts import skip_thoughts_model + +FLAGS = tf.flags.FLAGS + +# Data +tf.flags.DEFINE_string("input_file_pattern", None, + "File pattern of sharded TFRecord files containing " + "tf.Example protos.") + +# Training dir +tf.flags.DEFINE_string("run_dir", None, + "Directory where all of the runs are.") +tf.flags.DEFINE_string("train_dir", None, + "Directory for training. 
Overwrites autogenerated "
+                       "train_dir.")
+
+# Vocabulary config
+tf.flags.DEFINE_integer("vocab_size", 20000,
+                        "Size of the vocabulary")
+
+
+# Word embedding config
+tf.flags.DEFINE_integer("word_dim", 620,
+                        "Dimensionality of the word embeddings")
+tf.flags.DEFINE_string("pretrained_word_emb_file", None,
+                       "File containing pre-trained word embeddings, "
+                       "such as word2vec")
+tf.flags.DEFINE_bool("word_emb_trainable", False,
+                     "Whether pre-trained word embeddings are "
+                     "jointly trainable with the model.")
+
+# Encoder config
+tf.flags.DEFINE_integer("encoder_dim", 2400,
+                        "The number of units to use in encoder and decoder "
+                        "rnn cells.")
+tf.flags.DEFINE_bool("skipgram_encoder", False,
+                     "Whether to use a skipgram encoder (sum of embeddings) "
+                     "instead of a sequence encoder (RNN)")
+
+# Decoder config
+tf.flags.DEFINE_string("decoder", None,
+                       "Decoder specification in SEQxSKGy format, "
+                       "where x, y can be 0, 1, 2, and 3. "
+                       "SEQ stands for sequence (recurrent) decoder "
+                       "and SKG stands for bag-of-words (BOW) decoder. "
+                       "0 - no decoder of this type is present. "
+                       "1 - decoder for the current sentence (Autoencoder). "
+                       "2 - decoders for the previous and next sentences "
+                       "(Skip-Thought/FastSent style). "
+                       "3 - decoders for previous, current, and next "
+                       "sentences (Skip-Thought + Autoencoder). "
+                       "Note that it is possible to combine SEQ and SKG")
+tf.flags.DEFINE_bool("share_weights_logits", True,
+                     "Whether to tie the weights in the output layer of the "
+                     "decoder")
+tf.flags.DEFINE_bool("normalise_decoder_losses", False,
+                     "Whether to normalise the losses of the decoders. If "
+                     "True, this divides each sequence loss by the number "
+                     "of sequence decoders, and the skipgram "
+                     "decoder losses by the number of skipgram decoders.")
+tf.flags.DEFINE_float("skipgram_prefactor", 1.,
+                      "Constant to multiply each skipgram loss with.")
+tf.flags.DEFINE_float("sequence_prefactor", 1.,
+                      "Constant to multiply each SEQ loss with.")
+
+# Training config
+tf.flags.DEFINE_integer("number_of_steps", 500000,
+                        "The number of steps to take.")
+tf.flags.DEFINE_float("gpu_fraction", 1.0,
+                      "What fraction of the GPU to use")
+tf.flags.DEFINE_integer("batch_size", 128,
+                        "Batch size")
+
+tf.logging.set_verbosity(tf.logging.INFO)
+
+
+def _setup_learning_rate(config, global_step):
+  """Sets up the learning rate with optional exponential decay.
+
+  Args:
+    config: Object containing learning rate configuration parameters.
+    global_step: Tensor; the global step.
+
+  Returns:
+    learning_rate: Tensor; the learning rate, with exponential decay applied
+      if configured.
+ """ + if config.learning_rate_decay_factor > 0: + learning_rate = tf.train.exponential_decay( + learning_rate=float(config.learning_rate), + global_step=global_step, + decay_steps=config.learning_rate_decay_steps, + decay_rate=config.learning_rate_decay_factor, + staircase=False) + else: + learning_rate = tf.constant(config.learning_rate) + return learning_rate + + +def write_config(train_dir, flags): + flags_path = os.path.join(train_dir, 'flags.pkl') + configs_pkl_path = os.path.join(train_dir, 'config.pkl') + configs_pkl_json = os.path.join(train_dir, 'config.json') + + tf.logging.info("Writing out flags to {p}.".format( + p=flags_path)) + with open(flags_path, 'w') as f: + cPickle.dump(flags, f) + + tf.logging.info("Writing out config dict to {p}.".format( + p=configs_pkl_path)) + with open(configs_pkl_path, 'w') as f: + cPickle.dump(flags.__flags, f) + + tf.logging.info("Writing out config json to {p}.".format( + p=configs_pkl_json)) + with open(configs_pkl_json, 'w') as f: + json.dump(flags.__flags, f) + + return -1 + + +def main(unused_argv): + if not FLAGS.input_file_pattern: + raise ValueError("--input_file_pattern is required.") + if not FLAGS.run_dir: + raise ValueError("--run_dir is required.") + if not FLAGS.decoder: + raise ValueError("--decoder is required.") + + if not FLAGS.train_dir: + train_dir = os.path.join( + FLAGS.run_dir, 'run_{t}'.format(t=time.time())) + tf.logging.info("No specified --train_dir. Creating {d}.".format( + d=train_dir)) + os.makedirs(train_dir) + + write_config(train_dir=train_dir, flags=FLAGS) + + else: + tf.logging.info("Specified --train_dir {d}; Not autocreating.".format( + d=FLAGS.train_dir)) + train_dir = FLAGS.train_dir + + decoder_config = experiments.get_decoder_config(flags=FLAGS) + model_config = configuration.model_config( + input_file_pattern=FLAGS.input_file_pattern, + vocab_size=FLAGS.vocab_size, + batch_size=FLAGS.batch_size, + word_embedding_dim=FLAGS.word_dim, + pretrained_word_emb_file=FLAGS.pretrained_word_emb_file, + word_emb_trainable=FLAGS.word_emb_trainable, + encoder_dim=FLAGS.encoder_dim, + skipgram_encoder=FLAGS.skipgram_encoder, + sequence_decoder_pre=decoder_config.sequence_decoder_pre, + sequence_decoder_cur=decoder_config.sequence_decoder_cur, + sequence_decoder_post=decoder_config.sequence_decoder_post, + skipgram_decoder_pre=decoder_config.skipgram_decoder_pre, + skipgram_decoder_cur=decoder_config.skipgram_decoder_cur, + skipgram_decoder_post=decoder_config.skipgram_decoder_post, + share_weights_logits=FLAGS.share_weights_logits, + normalise_decoder_losses=FLAGS.normalise_decoder_losses, + skipgram_prefactor=FLAGS.skipgram_prefactor, + sequence_prefactor=FLAGS.sequence_prefactor) + training_config = configuration.training_config( + number_of_steps=FLAGS.number_of_steps) + + tf.logging.info("Building training graph.") + g = tf.Graph() + with g.as_default(): + tf.set_random_seed(1234) + model = skip_thoughts_model.SkipThoughtsModel( + model_config, mode="train") + model.build() + + learning_rate = _setup_learning_rate( + training_config, model.global_step) + optimizer = tf.train.AdamOptimizer(learning_rate) + + train_tensor = tf.contrib.slim.learning.create_train_op( + total_loss=model.total_loss, + optimizer=optimizer, + global_step=model.global_step, + clip_gradient_norm=training_config.clip_gradient_norm, + summarize_gradients=True, + check_numerics=True) + + saver = tf.train.Saver() + + gpu_options = tf.GPUOptions( + per_process_gpu_memory_fraction=FLAGS.gpu_fraction) + + tf.contrib.slim.learning.train( + 
train_op=train_tensor, + logdir=train_dir, + graph=g, + global_step=model.global_step, + number_of_steps=training_config.number_of_steps, + session_config=tf.ConfigProto(gpu_options=gpu_options), + save_summaries_secs=training_config.save_summaries_secs, + saver=saver, + save_interval_secs=training_config.save_model_secs) + + +if __name__ == "__main__": + tf.app.run() diff --git a/skip_thoughts/vocabulary_expansion.py b/skip_thoughts/vocabulary_expansion.py new file mode 100644 index 0000000..a215431 --- /dev/null +++ b/skip_thoughts/vocabulary_expansion.py @@ -0,0 +1,203 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Compute an expanded vocabulary of embeddings using a word2vec model. + +This script loads the word embeddings from a trained skip-thoughts model and +from a trained word2vec model (typically with a larger vocabulary). It trains a +linear regression model without regularization to learn a linear mapping from +the word2vec embedding space to the skip-thoughts embedding space. The model is +then applied to all words in the word2vec vocabulary, yielding vectors in the +skip-thoughts word embedding space for the union of the two vocabularies. + +The linear regression task is to learn a parameter matrix W to minimize + || X - Y * W ||^2, +where X is a matrix of skip-thoughts embeddings of shape [num_words, dim1], +Y is a matrix of word2vec embeddings of shape [num_words, dim2], and W is a +matrix of shape [dim2, dim1]. + +This is based on the "Translation Matrix" method from the paper: + + "Exploiting Similarities among Languages for Machine Translation" + Tomas Mikolov, Quoc V. Le, Ilya Sutskever + https://arxiv.org/abs/1309.4168 +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections +import os.path + + +import gensim.models +import numpy as np +import sklearn.linear_model +import tensorflow as tf + +FLAGS = tf.flags.FLAGS + +tf.flags.DEFINE_string("skip_thoughts_model", None, + "Checkpoint file or directory containing a checkpoint " + "file.") + +tf.flags.DEFINE_string("skip_thoughts_vocab", None, + "Path to vocabulary file containing a list of newline-" + "separated words where the word id is the " + "corresponding 0-based index in the file.") + +tf.flags.DEFINE_string("word2vec_model", None, + "File containing a word2vec model in binary format.") + +tf.flags.DEFINE_string("output_dir", None, "Output directory.") + +tf.logging.set_verbosity(tf.logging.INFO) + + +def _load_skip_thoughts_embeddings(checkpoint_path): + """Loads the embedding matrix from a skip-thoughts model checkpoint. + + Args: + checkpoint_path: Model checkpoint file or directory containing a checkpoint + file. + + Returns: + word_embedding: A numpy array of shape [vocab_size, embedding_dim]. + + Raises: + ValueError: If no checkpoint file matches checkpoint_path. 
+ """ + if tf.gfile.IsDirectory(checkpoint_path): + checkpoint_file = tf.train.latest_checkpoint(checkpoint_path) + if not checkpoint_file: + raise ValueError("No checkpoint file found in %s" % checkpoint_path) + else: + checkpoint_file = checkpoint_path + + tf.logging.info("Loading skip-thoughts embedding matrix from %s", + checkpoint_file) + reader = tf.train.NewCheckpointReader(checkpoint_file) + word_embedding = reader.get_tensor("word_embedding") + tf.logging.info("Loaded skip-thoughts embedding matrix of shape %s", + word_embedding.shape) + + return word_embedding + + +def _load_vocabulary(filename): + """Loads a vocabulary file. + + Args: + filename: Path to text file containing newline-separated words. + + Returns: + vocab: A dictionary mapping word to word id. + """ + tf.logging.info("Reading vocabulary from %s", filename) + vocab = collections.OrderedDict() + with tf.gfile.GFile(filename, mode="r") as f: + for i, line in enumerate(f): + word = line.strip().decode("utf-8") + assert word not in vocab, "Attempting to add word twice: %s" % word + vocab[word] = i + tf.logging.info("Read vocabulary of size %d", len(vocab)) + return vocab + + +def _expand_vocabulary(skip_thoughts_emb, skip_thoughts_vocab, word2vec): + """Runs vocabulary expansion on a skip-thoughts model using a word2vec model. + + Args: + skip_thoughts_emb: A numpy array of shape [skip_thoughts_vocab_size, + skip_thoughts_embedding_dim]. + skip_thoughts_vocab: A dictionary of word to id. + word2vec: An instance of gensim.models.KeyedVectors. + + Returns: + combined_emb: A dictionary mapping words to embedding vectors. + """ + # Find words shared between the two vocabularies. + tf.logging.info("Finding shared words") + shared_words = [w for w in word2vec.vocab if w in skip_thoughts_vocab] + + # Select embedding vectors for shared words. + tf.logging.info("Selecting embeddings for %d shared words", len(shared_words)) + shared_st_emb = skip_thoughts_emb[[ + skip_thoughts_vocab[w] for w in shared_words + ]] + shared_w2v_emb = word2vec[shared_words] + + # Train a linear regression model on the shared embedding vectors. + tf.logging.info("Training linear regression model") + model = sklearn.linear_model.LinearRegression() + model.fit(shared_w2v_emb, shared_st_emb) + + # Create the expanded vocabulary. + tf.logging.info("Creating embeddings for expanded vocabuary") + combined_emb = collections.OrderedDict() + for w in word2vec.vocab: + # Ignore words with underscores (spaces). + if "_" not in w: + w_emb = model.predict(word2vec[w].reshape(1, -1)) + combined_emb[w] = w_emb.reshape(-1) + + for w in skip_thoughts_vocab: + combined_emb[w] = skip_thoughts_emb[skip_thoughts_vocab[w]] + + tf.logging.info("Created expanded vocabulary of %d words", len(combined_emb)) + + return combined_emb + + +def main(unused_argv): + if not FLAGS.skip_thoughts_model: + raise ValueError("--skip_thoughts_model is required.") + if not FLAGS.skip_thoughts_vocab: + raise ValueError("--skip_thoughts_vocab is required.") + if not FLAGS.word2vec_model: + raise ValueError("--word2vec_model is required.") + if not FLAGS.output_dir: + raise ValueError("--output_dir is required.") + + if not tf.gfile.IsDirectory(FLAGS.output_dir): + tf.gfile.MakeDirs(FLAGS.output_dir) + + # Load the skip-thoughts embeddings and vocabulary. + skip_thoughts_emb = _load_skip_thoughts_embeddings(FLAGS.skip_thoughts_model) + skip_thoughts_vocab = _load_vocabulary(FLAGS.skip_thoughts_vocab) + + # Load the Word2Vec model. 
+  word2vec = gensim.models.KeyedVectors.load_word2vec_format(
+      FLAGS.word2vec_model, binary=True)
+
+  # Run vocabulary expansion.
+  embedding_map = _expand_vocabulary(skip_thoughts_emb, skip_thoughts_vocab,
+                                     word2vec)
+
+  # Save the output.
+  vocab = embedding_map.keys()
+  vocab_file = os.path.join(FLAGS.output_dir, "vocab.txt")
+  with tf.gfile.GFile(vocab_file, "w") as f:
+    f.write("\n".join(vocab))
+  tf.logging.info("Wrote vocabulary file to %s", vocab_file)
+
+  embeddings = np.array(embedding_map.values())
+  embeddings_file = os.path.join(FLAGS.output_dir, "embeddings.npy")
+  np.save(embeddings_file, embeddings)
+  tf.logging.info("Wrote embeddings file to %s", embeddings_file)
+
+
+if __name__ == "__main__":
+  tf.app.run()
diff --git a/unrolling_the_decoder.md b/unrolling_the_decoder.md
new file mode 100644
index 0000000..1179416
--- /dev/null
+++ b/unrolling_the_decoder.md
@@ -0,0 +1,115 @@
+# Unrolling the Decoder
+
+## Introduction
+
+As discussed in the Decoding Decoders paper, the optimal space for an RNN plus softmax projection is
+obtained by unrolling the decoder and using the concatenation of its hidden states as the
+representation of the input (see figure below).
+![Unrolling an RNN Decoder](/images/unroll.png)
+
+## Code
+
+In order to use the decoder at inference time, in contrast to the original [TensorFlow SkipThoughts implementation](https://github.com/tensorflow/models/tree/master/research/skip_thoughts), we need to load the entire graph.
+
+First, we load the encoder.
+
+```{python}
+flags.data_dir = ...
+flags.uni_vocab_file = ...
+flags.uni_embeddings_file = ...
+flags.uni_checkpoint_path = ...
+flags.decoder = ...
+
+decoder_config = experiments.get_decoder_config(flags=flags)
+
+uni_config = configuration.model_config(
+    sequence_decoder_pre=decoder_config.sequence_decoder_pre,
+    sequence_decoder_cur=decoder_config.sequence_decoder_cur,
+    sequence_decoder_post=decoder_config.sequence_decoder_post,
+    skipgram_decoder_pre=decoder_config.skipgram_decoder_pre,
+    skipgram_decoder_cur=decoder_config.skipgram_decoder_cur,
+    skipgram_decoder_post=decoder_config.skipgram_decoder_post)
+
+encoder = encoder_manager.EncoderManager()
+
+encoder.load_model(uni_config,
+                   flags.uni_vocab_file,
+                   flags.uni_embeddings_file,
+                   flags.uni_checkpoint_path,
+                   mode='encode-decode')
+```
+We then pull the graph and session object from the encoder:
+```{python}
+g, sess = encoder.graph, encoder.sessions[0]
+```
+In order to perform the unrolling, we need the names of the tensors involved in the unrolling process.
+If tracking the names of tensors is difficult, you can always modify the architecture post-training in such
+a way that data-flow tensors acquire a specific name, for example using `tf.identity`:
+```{python}
+tensor_to_name = tf.identity(tensor_to_name, name='name_of_tensor_to_name')
+```
+Now this tensor can be accessed using the `get_tensor_by_name(...)` method of the `tf.Graph`.
+In this case, the tensor would acquire the name `name_of_tensor_to_name:0` since it is the zero-th tensor produced
+by the `tf.identity` op named `name_of_tensor_to_name`.
+
+For each decoder we need the following tensors:
++ The logits (for example `decoder_pre_logits:0`)
++ The decoder output (for example `decoder_pre_output:0`)
++ The decoder state (for example `decoder_pre_state:0`)
+as well as the word embedding matrix, for example `word_embedding:0`.
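+
+As a minimal sketch (assuming the decoders were given the example names above; your own
+graph may use different names), these tensors can be fetched directly from the loaded graph:
+```{python}
+# Hypothetical tensor names -- adjust to match your own checkpoint.
+logits_pre = g.get_tensor_by_name('decoder_pre_logits:0')
+output_pre = g.get_tensor_by_name('decoder_pre_output:0')
+state_pre = g.get_tensor_by_name('decoder_pre_state:0')
+word_embedding = g.get_tensor_by_name('word_embedding:0')
+```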
+
+Using these, we can define dictionaries of the tensors necessary for the unrolling - one global dictionary, and one specific to each decoder:
+```{python}
+tensor_names_global = {
+    'word_embedding': 'word_embedding:0'}
+
+tensor_names_pre = {
+    'logits': 'decoder_pre_logits:0',
+    'decoder_output': 'decoder_pre_output:0',
+    'decoder_state': 'decoder_pre_state:0'}
+
+tensor_names_post = {
+    'logits': 'decoder_post_logits:0',
+    'decoder_output': 'decoder_post_output:0',
+    'decoder_state': 'decoder_post_state:0'}
+```
+To build the decoders, we use these dictionaries to create instances of the [Decoder class](/skip_thoughts/decode.py):
+```{python}
+decoder_pre = decode.Decoder(
+    g=g,
+    tensor_names_decoder=tensor_names_pre,
+    tensor_names_global=tensor_names_global)
+
+decoder_post = decode.Decoder(
+    g=g,
+    tensor_names_decoder=tensor_names_post,
+    tensor_names_global=tensor_names_global)
+```
+With this setup, we can now do some unrolling:
+```{python}
+batch_size = 2
+unroll_steps = 5
+
+data = [
+    "and wow",
+    "hey !",
+    "what's this thing suddenly coming towards me very fast ?",
+    "very very fast",
+    "so big and flat and round , it needs a big wide sounding name like ow ound round ground !",
+    "that's it !",
+    "that's a good name – ground !",
+    "i wonder if it will be friends with me ?"]
+
+decode_pre_rep, decode_post_rep = decode.decode(
+    sess=sess, data=data,
+    encoder=encoder, decoder_pre=decoder_pre, decoder_post=decoder_post,
+    batch_size=batch_size, use_norm=True, steps=unroll_steps)
+```
+The vector representations `decode_pre_rep` and `decode_post_rep` are the unrolled representations from the previous- and next-sentence decoders respectively. They are aligned by sentence, and can be concatenated to produce a single representation:
+```{python}
+decode_rep_concat = np.concatenate(
+    (np.array(decode_pre_rep), np.array(decode_post_rep)), axis=1)
+```
+which can then be used for downstream tasks, such as similarity and transfer.
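+
+For instance, here is a minimal sketch (not part of the repo) of comparing two of the
+concatenated representations by cosine similarity:
+```{python}
+import numpy as np
+
+# Cosine similarity between the representations of the first two sentences.
+a, b = decode_rep_concat[0], decode_rep_concat[1]
+cosine_sim = np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
+print(cosine_sim)
+```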