From 6029e89bedd9d12a2aa70c911ff55127d6e49309 Mon Sep 17 00:00:00 2001 From: Shiva Rama Krishna <45482631+srkchowdary2000@users.noreply.github.com> Date: Sun, 21 May 2023 21:06:50 +0530 Subject: [PATCH] Update prepro.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make preprocessing also work for datasets other than the 2016 release, such as the IWSLT 2017 German–English parallel corpus ( https://huggingface.co/datasets/iwslt2017/resolve/main/data/2017-01-trnted/texts/de/en/de-en.zip ). Markup lines that begin with a single space followed by "<" are now skipped as well, not only lines that begin with "<". --- prepro.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/prepro.py b/prepro.py index c08c7c9..1decc25 100644 --- a/prepro.py +++ b/prepro.py @@ -35,7 +35,7 @@ def prepro(hp): logging.info("# Preprocessing") # train _prepro = lambda x: [line.strip() for line in open(x, 'r').read().split("\n") \ - if not line.startswith("<")] + if not (line.startswith("<") or line.startswith(" <"))] prepro_train1, prepro_train2 = _prepro(train1), _prepro(train2) assert len(prepro_train1)==len(prepro_train2), "Check if train source and target files match." @@ -109,4 +109,4 @@ def _segment_and_write(sents, fname): parser = hparams.parser hp = parser.parse_args() prepro(hp) - logging.info("Done") \ No newline at end of file + logging.info("Done")