Skip to content

Commit

Permalink
Fixed duplicate BOS token in training input.
Browse files Browse the repository at this point in the history
PiperOrigin-RevId: 657230030
Change-Id: I0a4ddb7928887524276f287bf97c13051ca3ffb5
  • Loading branch information
Gemma Team authored and texasmichelle committed Jul 30, 2024
1 parent 4dbac72 commit 20cb3da
Showing 1 changed file with 15 additions and 3 deletions.
18 changes: 15 additions & 3 deletions colabs/fine_tuning_tutorial.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -258,6 +258,7 @@
" example: str | bytes,\n",
" prefix: str = '',\n",
" suffix: str = '',\n",
" add_bos: bool = True,\n",
" add_eos: bool = True) -\u003e jax.Array:\n",
" \"\"\"\n",
" Tokenization function.\n",
Expand All @@ -266,12 +267,16 @@
" example: input string to tokenize.\n",
" prefix: prefix to add to the input string.\n",
" suffix: suffix to add to the input string.\n",
" add_eos: if True, add an end of sentence token at the end of the output\n",
" add_bos: if True, add a beginning of sequence token at the start of the\n",
" tokenized sequence.\n",
" add_eos: if True, add an end of sequence token at the end of the tokenized\n",
" sequence.\n",
" Returns:\n",
" Tokens corresponding to the input string.\n",
" \"\"\"\n",
" int_list = [self._spm_processor.bos_id()]\n",
" int_list = []\n",
" if add_bos:\n",
" int_list.append(self._spm_processor.bos_id())\n",
" int_list.extend(self._spm_processor.EncodeAsIds(prefix + example + suffix))\n",
" if add_eos:\n",
" int_list.append(self._spm_processor.eos_id())\n",
Expand All @@ -282,11 +287,12 @@
" str_tensor: tf.Tensor,\n",
" prefix: str = '',\n",
" suffix: str = '',\n",
" add_bos: bool = True,\n",
" add_eos: bool = True) -\u003e tf.Tensor:\n",
"    \"\"\"Tensorflow operator for the tokenize function.\"\"\"\n",
" encoded = tf.numpy_function(\n",
" self.tokenize,\n",
" [str_tensor, prefix, suffix, add_eos],\n",
" [str_tensor, prefix, suffix, add_bos, add_eos],\n",
" tf.int32)\n",
" encoded.set_shape([None])\n",
" return encoded\n",
Expand Down Expand Up @@ -320,9 +326,11 @@
" return tokenizer.tokenize_tf_op(example,\n",
" prefix='Translate this into French:\\n',\n",
" suffix='\\n',\n",
" add_bos=True,\n",
" add_eos=False)\n",
"def tokenize_destination(tokenizer, example: tf.Tensor):\n",
" return tokenizer.tokenize_tf_op(example,\n",
" add_bos=False,\n",
" add_eos=True)\n",
"\n",
"ds = tfds.load(\"mtnt/en-fr\",split=\"train\")\n",
Expand Down Expand Up @@ -399,14 +407,18 @@
"\n",
" def _tokenize_source(self, example: tf.Tensor):\n",
" \"\"\"Tokenization function for the source.\"\"\"\n",
" # We add \u003cBOS\u003e as these tokens are the start of our sequence.\n",
" return self._tokenizer.tokenize_tf_op(example,\n",
" prefix=self.TRANSLATION_PREFIX,\n",
" suffix=self.TRANSLATION_SUFFIX,\n",
" add_bos=True,\n",
" add_eos=False)\n",
"\n",
" def _tokenize_destination(self, example: tf.Tensor):\n",
" \"\"\"Tokenization function for the French translation.\"\"\"\n",
" # We do not add \u003cBOS\u003e as these tokens get appended to the source tokens.\n",
" return self._tokenizer.tokenize_tf_op(example,\n",
" add_bos=False,\n",
" add_eos=True)\n",
"\n",
" def _pad_up_to_max_len(self,\n",
Expand Down

0 comments on commit 20cb3da

Please sign in to comment.