From 6029e89bedd9d12a2aa70c911ff55127d6e49309 Mon Sep 17 00:00:00 2001
From: Shiva Rama Krishna <45482631+srkchowdary2000@users.noreply.github.com>
Date: Sun, 21 May 2023 21:06:50 +0530
Subject: [PATCH] prepro.py: also skip markup lines that start with " <"
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

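Update the preprocessing so that it also works with datasets other than the 2016 ones, such as the IWSLT 2017 German–English parallel corpus (https://huggingface.co/datasets/iwslt2017/resolve/main/data/2017-01-trnted/texts/de/en/de-en.zip). Training lines that start with " <" (a tag preceded by a single space) are now skipped, in addition to lines that start with "<".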
---
 prepro.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/prepro.py b/prepro.py
index c08c7c9..1decc25 100644
--- a/prepro.py
+++ b/prepro.py
@@ -35,7 +35,7 @@ def prepro(hp):
     logging.info("# Preprocessing")
     # train
     _prepro = lambda x:  [line.strip() for line in open(x, 'r').read().split("\n") \
-                      if not line.startswith("<")]
+                      if not (line.startswith("<") or line.startswith(" <"))]
     prepro_train1, prepro_train2 = _prepro(train1), _prepro(train2)
     assert len(prepro_train1)==len(prepro_train2), "Check if train source and target files match."
 
@@ -109,4 +109,4 @@ def _segment_and_write(sents, fname):
     parser = hparams.parser
     hp = parser.parse_args()
     prepro(hp)
-    logging.info("Done")
\ No newline at end of file
+    logging.info("Done")