From 3c194716830b250eed1ca7c91269f4c9e817288d Mon Sep 17 00:00:00 2001
From: William Jiang
Date: Wed, 3 Jan 2018 14:28:04 +0800
Subject: [PATCH] Build vocabulary script

---
 nmt/scripts/build_vocab.py | 99 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 99 insertions(+)
 create mode 100755 nmt/scripts/build_vocab.py

diff --git a/nmt/scripts/build_vocab.py b/nmt/scripts/build_vocab.py
new file mode 100755
index 000000000..dd930235f
--- /dev/null
+++ b/nmt/scripts/build_vocab.py
@@ -0,0 +1,99 @@
+# Copyright 2017 Google Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Utility to build a vocabulary file from tokenized text."""
+
+from __future__ import absolute_import
+from __future__ import print_function
+from __future__ import unicode_literals
+
+import argparse
+import codecs
+import os
+import time
+
+
+def get_time():
+  """Return the local time as a formatted timestamp string."""
+  return time.strftime("%Y/%m/%d %H:%M:%S", time.localtime())
+
+
+def basic_tokenizer(sentence, delimiter=None):
+  """A very simple tokenizer that splits a pre-tokenized sentence.
+
+  Args:
+    sentence: a token sequence, already pre-tokenized into a string.
+    delimiter: delimiter string. If None, split on any whitespace.
+
+  Returns:
+    A list of words.
+  """
+  return sentence.strip().split(delimiter)
+
+
+def main():
+  parser = argparse.ArgumentParser(description="Build vocabulary")
+  parser.add_argument(
+      "--data", required=True,
+      help="Source text file")
+  parser.add_argument(
+      "--save_vocab", required=True,
+      help="Output vocabulary file")
+  parser.add_argument(
+      "--min_frequency", default=1, type=int,
+      help="Minimum word frequency (default 1)")
+  parser.add_argument(
+      "--size", default=0, type=int,
+      help="Maximum vocabulary size; 0 means no limit (default 0)")
+  parser.add_argument(
+      "--without_special_token", action="store_true",
+      help="If set, the vocabulary will not contain special tokens "
+           "such as '<unk>', '<s>' and '</s>'.")
+  args = parser.parse_args()
+
+  if not os.path.exists(args.data):
+    raise ValueError("%s doesn't exist!" % args.data)
+
+  # build vocabulary
+  print(get_time() + " Build vocabulary...")
+  with codecs.open(args.data, "r", "utf-8") as data_f:
+    with codecs.open(args.save_vocab, "w", "utf-8") as vocab_f:
+      vocab_dict = {}
+      for line in data_f:
+        # space split
+        for w in basic_tokenizer(line):
+          vocab_dict[w] = vocab_dict.get(w, 0) + 1
+      # filter low frequency words; iterate over a copy since we delete
+      if args.min_frequency > 1:
+        for w, v in list(vocab_dict.items()):
+          if v < args.min_frequency:
+            del vocab_dict[w]
+      # sort words by descending frequency
+      vocab = sorted(vocab_dict, key=vocab_dict.get, reverse=True)
+      # prepend special tokens
+      if not args.without_special_token:
+        vocab = ["<unk>", "<s>", "</s>"] + vocab
+      # cut vocabulary to max vocabulary size
+      if args.size > 0:
+        vocab = vocab[:args.size]
+      # write vocab to vocabulary file
+      for word in vocab:
+        vocab_f.write(word + "\n")
+      print(get_time() +
+            " %d words have been written into vocabulary file..." % len(vocab))
+
+
+if __name__ == "__main__":
+  main()
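
A minimal usage sketch; the /tmp/nmt_data paths below are placeholders for
an existing tokenized corpus and the desired output file, and the flag
values are only illustrative:

  python nmt/scripts/build_vocab.py \
    --data /tmp/nmt_data/train.en \
    --save_vocab /tmp/nmt_data/vocab.en \
    --min_frequency 5 \
    --size 50000

Words are written one per line, most frequent first, with <unk>, <s> and
</s> prepended unless --without_special_token is passed.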
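
As a sanity check of the counting and sorting logic, a data file holding
the two lines "a b a" and "b c" would, with default flags, produce the
vocabulary below ("a" and "b" both occur twice; ties keep first-seen order
on Python 3.7+ because sorted() is stable, and may vary on older
interpreters):

  <unk>
  <s>
  </s>
  a
  b
  c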